diff --git a/arch/x64/arch-mmu.hh b/arch/x64/arch-mmu.hh
index 5aaa6236bfb98ff9eb6e36b298b15ad89ec45da9..dd4c60c8cbbebc8d8d806131a9d87b97c7d41893 100644
--- a/arch/x64/arch-mmu.hh
+++ b/arch/x64/arch-mmu.hh
@@ -9,7 +9,8 @@
 #define ARCH_MMU_HH_

 #include <osv/ilog2.hh>
-#include <osv/mmu.hh>
+#include <osv/types.h>
+#include <osv/mmu-defs.hh>

 namespace mmu {

@@ -68,19 +69,20 @@ class hw_ptep {
 public:
     hw_ptep(const hw_ptep& a) : p(a.p) {}
     pt_element read() const { return *p; }
-    void write(pt_element pte) { *const_cast<volatile u64*>(&p->x) = pte.x; }
-    bool compare_exchange(pt_element oldval, pt_element newval) {
+    void write(pt_element pte) const { *const_cast<volatile u64*>(&p->x) = pte.x; }
+    bool compare_exchange(pt_element oldval, pt_element newval) const {
         std::atomic<u64> *x = reinterpret_cast<std::atomic<u64>*>(&p->x);
         return x->compare_exchange_strong(oldval.x, newval.x, std::memory_order_relaxed);
     }
-    pt_element exchange(pt_element newval) {
+    pt_element exchange(pt_element newval) const {
         std::atomic<u64> *x = reinterpret_cast<std::atomic<u64>*>(&p->x);
         return pt_element(x->exchange(newval.x));
     }
-    hw_ptep at(unsigned idx) { return hw_ptep(p + idx); }
+    hw_ptep at(unsigned idx) const { return hw_ptep(p + idx); }
     static hw_ptep force(pt_element* ptep) { return hw_ptep(ptep); }
     // no longer using this as a page table
-    pt_element* release() { return p; }
+    pt_element* release() const { return p; }
+    bool operator==(const hw_ptep& a) const noexcept { return p == a.p; }
 private:
     hw_ptep(pt_element* ptep) : p(ptep) {}
     pt_element* p;
diff --git a/bsd/porting/mmu.cc b/bsd/porting/mmu.cc
index 7be239eefa1b3b6bdb8e8827efd5d4c187b73e8d..ec3f18196b510e4f25aae6183d4c0043cffe484b 100644
--- a/bsd/porting/mmu.cc
+++ b/bsd/porting/mmu.cc
@@ -42,5 +42,5 @@ int vm_throttling_needed(void)

 void mmu_unmap(void *addr, size_t size)
 {
-    mmu::unmap_address(addr, size);
+    mmu::unmap_address(addr, addr, size);
 }
diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index b7b3bbb6d933003e82978d2f02dd89ef24ec8225..f0bb64e9c61f47417331cc1aafb43d1c372371a9 100644
--- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -980,14 +980,11 @@ xuio_stat_wbuf_nocopy()

 #ifdef _KERNEL
 int
-dmu_map_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, bool map)
+dmu_map_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, unsigned action)
 {
         dmu_buf_t **dbp;
         int err;
-        struct uio_mapper *uio_map = (struct uio_mapper *)uio;
         struct iovec *iov;
-        int tocpy;
-        int bufoff;
         int numbufs = 0;

         // This will acquire a reference both in the dbuf, and in the ARC buffer.
@@ -1003,33 +1000,19 @@ dmu_map_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, bool map)
         dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
         arc_buf_t *dbuf_abuf = dbi->db_buf;

-        if (map) {
-                arc_share_buf(dbi->db_buf);
-
-                bufoff = uio->uio_loffset - db->db_offset;
-                tocpy = (int)MIN(db->db_size - bufoff, size);
+        iov = uio->uio_iov;
+        iov->iov_base = dbuf_abuf->b_data;
+        iov->iov_len = db->db_size;
+        uio->uio_loffset = uio->uio_loffset - db->db_offset;

-                uio_map->buffer = dbuf_abuf->b_data;
-                // FIXME: Should be the ARC size, but that is private. They should be the same.
-                uio_map->buf_size = db->db_size;
-                uio_map->buf_off = bufoff;
-                iov = uio->uio_iov;
-                iov->iov_base = (char *)dbuf_abuf->b_data;
-                iov->iov_len = tocpy;
-        } else {
-                iov = uio->uio_iov;
-                // empty iov is a query operation.
-                if (iov->iov_base) {
-                        assert(iov->iov_base == (char *)dbuf_abuf->b_data);
-                        arc_unshare_buf(dbi->db_buf);
-                } else {
-                        iov->iov_base = (char *)dbuf_abuf->b_data;
-                        iov->iov_len = db->db_size;
-                }
-        }
+        if (action == ARC_ACTION_HOLD)
+                arc_share_buf(dbi->db_buf);
+        else if (action == ARC_ACTION_RELEASE)
+                arc_unshare_buf(dbi->db_buf);

         dmu_buf_rele_array(dbp, numbufs, FTAG);
-        return 0;
+
+        return (0);
 }

 int
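For orientation, a hedged sketch of the contract the reworked dmu_map_uio() exposes (the setup mirrors vfs_file::get_arcbuf() later in this patch; treat it as illustrative, not as patch code). Whatever the action, the single iovec is filled with the ARC buffer's address and size, and uio_loffset is rewritten to the offset inside that buffer; only the reference-count side effect depends on the action:

    // Sketch: a one-iovec uio, as get_arcbuf() builds it.
    iovec io = {};               // iov_base == nullptr, iov_len == 0
    uio u = {};
    u.uio_iov = &io;
    u.uio_iovcnt = 1;
    u.uio_offset = off_t(off);
    u.uio_resid = mmu::page_size;
    u.uio_rw = UIO_READ;

    dmu_map_uio(os, object, &u, mmu::page_size, ARC_ACTION_QUERY);   // fill iov only
    dmu_map_uio(os, object, &u, mmu::page_size, ARC_ACTION_HOLD);    // fill iov + arc_share_buf()
    dmu_map_uio(os, object, &u, mmu::page_size, ARC_ACTION_RELEASE); // fill iov + arc_unshare_buf()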
diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index 7a1afa5e628f0ab70c8bd2203438c85932c4f857..81c51f440ec4b46112c82c4af3e13efebb348389 100644
--- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -602,7 +602,7 @@ void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
         const void *buf, dmu_tx_t *tx);
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
         dmu_tx_t *tx);
-int dmu_map_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, bool map);
+int dmu_map_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, unsigned action);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
         dmu_tx_t *tx);
diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
index 42cad0f8f569cb0c8846fe5be65edfcb3b2c76dc..7cdf5fe462547b7bdfe65907a8e2353cbb3a36f0 100644
--- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
+++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c
@@ -636,7 +636,7 @@ out:
 static int zfs_truncate(struct vnode *vp, off_t new_size);

 static int
-zfs_manage_mapping(vnode_t *vp, struct file* fp, uio_t *uio, bool map)
+zfs_arc(vnode_t *vp, struct file* fp, uio_t *uio, unsigned action)
 {
         znode_t *zp = VTOZ(vp);
         zfsvfs_t *zfsvfs = zp->z_zfsvfs;
@@ -682,7 +682,7 @@ zfs_manage_mapping(vnode_t *vp, struct file* fp, uio_t *uio, bool map)
                 nbytes = MIN(nbytes, zfs_read_chunk_size -
                     P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

-                error = dmu_map_uio(os, zp->z_id, uio, nbytes, map);
+                error = dmu_map_uio(os, zp->z_id, uio, nbytes, action);
                 if (error) {
                         /* convert checksum errors into IO errors */
                         if (error == ECKSUM)
@@ -696,19 +696,6 @@ zfs_manage_mapping(vnode_t *vp, struct file* fp, uio_t *uio, bool map)
         return (error);
 }

-static int
-zfs_map(vnode_t *vp, struct file* fp, uio_t *uio)
-{
-        return zfs_manage_mapping(vp, fp, uio, true);
-}
-
-
-static int
-zfs_unmap(vnode_t *vp, struct file* fp, uio_t *uio)
-{
-        return zfs_manage_mapping(vp, fp, uio, false);
-}
-
 /*
  * Write the bytes to a file.
  *
@@ -4961,6 +4948,5 @@ struct vnops zfs_vnops = {
         zfs_inactive,           /* inactive */
         zfs_truncate,           /* truncate */
         zfs_link,               /* link */
-        zfs_map,                /* map */
-        zfs_unmap,              /* unmap */
+        zfs_arc,                /* arc */
 };
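Two things worth noting about zfs_arc() above, as a hedged aside rather than patch code. First, the call chain is now VOP_CACHE() -> zfs_arc() -> dmu_map_uio(), with the action passed through untouched. Second, the P2PHASE() clamp guarantees a request never crosses a zfs_read_chunk_size boundary, so it always resolves inside a single DMU buffer. Restated with a hypothetical helper name:

    // P2PHASE(x, align) is x & (align - 1) for power-of-two align, so the
    // clamp computes "bytes left in the current chunk":
    uint64_t bytes_this_chunk(uint64_t loffset, uint64_t n, uint64_t chunk)
    {
        return std::min(n, chunk - (loffset & (chunk - 1)));
    }
    // bytes_this_chunk(0x1f000, 0x2000, 0x10000) == 0x1000: the request is
    // trimmed at the chunk edge and the next page starts a new lookup.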
diff --git a/build.mk b/build.mk
index e2fe2ed487f6d6b766823c3e28fb90fb37c8a0ad..84534bf109dd748c46c9e8caa80a607062762793 100644
--- a/build.mk
+++ b/build.mk
@@ -622,6 +622,7 @@ objects += core/semaphore.o
 objects += core/condvar.o
 objects += core/debug.o
 objects += core/rcu.o
+objects += core/pagecache.o
 objects += drivers/pci.o
 objects += core/mempool.o
 objects += core/alloctracker.o
diff --git a/core/mmu.cc b/core/mmu.cc
index be0ae941a2ac04117160c6c816cfe2d628aedd43..f4826dfbd9098739fb657963152aabee5d149994 100644
--- a/core/mmu.cc
+++ b/core/mmu.cc
@@ -121,6 +121,24 @@ phys virt_to_phys(void *virt)
     return static_cast<char*>(virt) - phys_mem;
 }

+void* mmupage::vaddr() const
+{
+    return _page;
+}
+
+phys mmupage::paddr() const
+{
+    if (!_page) {
+        throw std::exception();
+    }
+    return virt_to_phys(_page);
+}
+
+bool mmupage::cow() const
+{
+    return _cow;
+}
+
 phys allocate_intermediate_level()
 {
     phys pt_page = virt_to_phys(memory::alloc_page());
@@ -156,16 +174,6 @@ bool change_perm(hw_ptep ptep, unsigned int perm)
     return old & ~perm;
 }

-// This is supposed to be lockless, so we need to rely heavily on atomics. The
-// reason for this is that this will be called from an invalidation handler,
-// which can in turn be called by the filesystem mapper (for instance, if the
-// filesystem needs to evict some memory before reading more)
-bool clear_present(hw_ptep ptep)
-{
-    pt_element old = ptep.exchange(make_empty_pte());
-    return old.present();
-}
-
 void split_large_page(hw_ptep ptep, unsigned level)
 {
     pt_element pte_orig = ptep.read();
@@ -183,11 +191,10 @@ void split_large_page(hw_ptep ptep, unsigned level)
 }

 struct page_allocator {
-    virtual void* alloc(uintptr_t offset) = 0;
-    virtual void* alloc(size_t size, uintptr_t offset) = 0;
-    virtual void free(void *addr, uintptr_t offset) = 0;
-    virtual void free(void *addr, size_t size, uintptr_t offset) = 0;
-    virtual void set_addr(uintptr_t addr) {}
+    virtual mmupage alloc(uintptr_t offset, hw_ptep ptep, bool write) = 0;
+    virtual mmupage alloc(size_t size, uintptr_t offset, hw_ptep ptep, bool write) = 0;
+    virtual void free(void *addr, uintptr_t offset, hw_ptep ptep) = 0;
+    virtual void free(void *addr, size_t size, uintptr_t offset, hw_ptep ptep) = 0;
     virtual void finalize() = 0;
     virtual ~page_allocator() {}
 };
@@ -475,41 +482,57 @@ template <account_opt T = account_opt::no>
 class populate : public vma_operation<allocate_intermediate_opt::yes, skip_empty_opt::no, T> {
 private:
     page_allocator* _page_provider;
-    unsigned int perm;
+    unsigned int _perm;
+    bool _write;
     bool _map_dirty;
     pt_element dirty(pt_element pte) {
         pte.set_dirty(_map_dirty);
         return pte;
     }
+    bool skip(pt_element pte) {
+        if (pte.empty()) {
+            return false;
+        }
+        return !_write || pte.writable();
+    }
+    unsigned int perm(bool cow) {
+        unsigned int p = _perm;
+        if (cow) {
+            p &= ~perm_write;
+        }
+        return p;
+    }
 public:
-    populate(page_allocator* pops, unsigned int perm, bool map_dirty = true) :
-        _page_provider(pops), perm(perm), _map_dirty(map_dirty) { }
+    populate(page_allocator* pops, unsigned int perm, bool write = false, bool map_dirty = true) :
+        _page_provider(pops), _perm(perm), _write(write), _map_dirty(map_dirty) { }
     void small_page(hw_ptep ptep, uintptr_t offset){
-        if (!ptep.read().empty()) {
+        pt_element pte = ptep.read();
+        if (skip(pte)) {
             return;
         }
-        phys page = virt_to_phys(_page_provider->alloc(offset));
-        if (!ptep.compare_exchange(make_empty_pte(), dirty(make_normal_pte(page, perm)))) {
-            _page_provider->free(phys_to_virt(page), offset);
+        mmupage page = _page_provider->alloc(offset, ptep, _write);
+        if (!ptep.compare_exchange(pte, dirty(make_normal_pte(page.paddr(), perm(page.cow()))))) {
+            _page_provider->free(page.vaddr(), offset, ptep);
         } else {
             this->account(mmu::page_size);
         }
     }
     bool huge_page(hw_ptep ptep, uintptr_t offset){
-        auto pte = ptep.read();
-        if (!pte.empty()) {
+        pt_element pte = ptep.read();
+        if (skip(pte)) {
             return true;
         }
-        void *vpage = _page_provider->alloc(huge_page_size, offset);
-        if (!vpage) {
-            return false;
-        }
-        phys page = virt_to_phys(vpage);
-        if (!ptep.compare_exchange(make_empty_pte(), dirty(make_large_pte(page, perm)))) {
-            _page_provider->free(phys_to_virt(page), huge_page_size, offset);
-        } else {
-            this->account(mmu::huge_page_size);
+        try {
+            mmupage page = _page_provider->alloc(huge_page_size, offset, ptep, _write);
+
+            if (!ptep.compare_exchange(pte, dirty(make_large_pte(page.paddr(), perm(page.cow()))))) {
+                _page_provider->free(page.vaddr(), huge_page_size, offset, ptep);
+            } else {
+                this->account(mmu::huge_page_size);
+            }
+        } catch(std::exception&) {
+            return false;
         }
         return true;
     }
@@ -518,7 +541,8 @@ public:
 template <account_opt Account = account_opt::no>
 class populate_small : public populate<Account> {
 public:
-    populate_small(page_allocator* pops, unsigned int perm, bool map_dirty = true) : populate<Account>(pops, perm, map_dirty) { }
+    populate_small(page_allocator* pops, unsigned int perm, bool write = false, bool map_dirty = true) :
+        populate<Account>(pops, perm, write, map_dirty) { }
     bool huge_page(hw_ptep ptep, uintptr_t offset) {
         assert(0);
         return false;
@@ -534,15 +558,16 @@ struct tlb_gather {
         void* addr;
         size_t size;
         off_t offset; // FIXME: unneeded?
+        pt_element* ptep;
     };
     page_allocator* page_provider;
     size_t nr_pages = 0;
     tlb_page pages[max_pages];
-    void push(void* addr, size_t size, off_t offset) {
+    void push(void* addr, size_t size, off_t offset, hw_ptep ptep) {
        if (nr_pages == max_pages) {
            flush();
        }
-       pages[nr_pages++] = { addr, size, offset };
+       pages[nr_pages++] = { addr, size, offset, ptep.release() };
     }
     void flush() {
         if (!nr_pages) {
@@ -552,9 +577,9 @@ struct tlb_gather {
         for (auto i = 0u; i < nr_pages; ++i) {
             auto&& tp = pages[i];
             if (tp.size == page_size) {
-                page_provider->free(tp.addr, tp.offset);
+                page_provider->free(tp.addr, tp.offset, hw_ptep::force(tp.ptep));
             } else {
-                page_provider->free(tp.addr, tp.size, tp.offset);
+                page_provider->free(tp.addr, tp.size, tp.offset, hw_ptep::force(tp.ptep));
             }
         }
         nr_pages = 0;
     }
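The skip()/perm() pair above is the whole copy-on-write policy, so a standalone restatement may help (the helper below is hypothetical; the other names match the patch). A page whose mmupage::cow() flag is set is mapped with write permission stripped; the resulting protection fault comes back through populate with _write == true, at which point skip() declines to skip and alloc() can substitute a private, writable copy:

    // Hypothetical standalone restatement of populate::perm():
    unsigned int effective_perm(unsigned int vma_perm, bool cow)
    {
        return cow ? vma_perm & ~mmu::perm_write : vma_perm;
    }
    // effective_perm(mmu::perm_rw, true)  == mmu::perm_read
    // effective_perm(mmu::perm_rw, false) == mmu::perm_rw
    // And populate::skip() in the same terms: map when the pte is empty,
    // or when this is a write fault on a pte that is not yet writable.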
@@ -577,13 +602,13 @@ public:
         // not-present may only mean mprotect(PROT_NONE).
         pt_element pte = ptep.read();
         ptep.write(make_empty_pte());
-        _tlb_gather.push(phys_to_virt(pte.addr(false)), page_size, offset);
+        _tlb_gather.push(phys_to_virt(pte.addr(false)), page_size, offset, ptep);
         this->account(mmu::page_size);
     }
     bool huge_page(hw_ptep ptep, uintptr_t offset) {
         pt_element pte = ptep.read();
         ptep.write(make_empty_pte());
-        _tlb_gather.push(phys_to_virt(pte.addr(true)), huge_page_size, offset);
+        _tlb_gather.push(phys_to_virt(pte.addr(true)), huge_page_size, offset, ptep);
         this->account(mmu::huge_page_size);
         return true;
     }
@@ -612,20 +637,6 @@ public:
     bool tlb_flush_needed(void) {return do_flush;}
 };

-class page_out: public vma_operation<allocate_intermediate_opt::no, skip_empty_opt::yes> {
-private:
-    bool do_flush = false;
-public:
-    page_out() { }
-    void small_page(hw_ptep ptep, uintptr_t offset) {
-        do_flush |= clear_present(ptep);
-    }
-    bool huge_page(hw_ptep ptep, uintptr_t offset) {
-        abort();
-    }
-    bool tlb_flush_needed(void) {return do_flush;}
-};
-
 class count_maps:
     public vma_operation<allocate_intermediate_opt::no,
                          skip_empty_opt::yes, account_opt::yes> {
@@ -885,16 +896,16 @@ private:
         return addr;
     }
 public:
-    virtual void* alloc(uintptr_t offset) override {
+    virtual mmupage alloc(uintptr_t offset, hw_ptep ptep, bool write) override {
         return fill(memory::alloc_page(), offset, page_size);
     }
-    virtual void* alloc(size_t size, uintptr_t offset) override {
+    virtual mmupage alloc(size_t size, uintptr_t offset, hw_ptep ptep, bool write) override {
         return fill(memory::alloc_huge_page(size), offset, size);
     }
-    virtual void free(void *addr, uintptr_t offset) override {
+    virtual void free(void *addr, uintptr_t offset, hw_ptep ptep) override {
         return memory::free_page(addr);
     }
-    virtual void free(void *addr, size_t size, uintptr_t offset) override {
+    virtual void free(void *addr, size_t size, uintptr_t offset, hw_ptep ptep) override {
         return memory::free_huge_page(addr, size);
     }
     virtual void finalize() override {
@@ -941,28 +952,24 @@ public:
 class map_file_page_mmap : public page_allocator {
 private:
     file* _file;
-    off_t _map_offset;
-    uintptr_t _start = 0;
+    off_t _foffset;
+    bool _shared;
 public:
-    map_file_page_mmap(file *file, off_t off) : _file(file), _map_offset(off) {}
+    map_file_page_mmap(file *file, off_t off, bool shared) : _file(file), _foffset(off), _shared(shared) {}
     virtual ~map_file_page_mmap() {};

-    virtual void* alloc(uintptr_t offset) override {
-        return _file->get_page(_start + offset, offset + _map_offset, page_size);
+    virtual mmupage alloc(uintptr_t offset, hw_ptep ptep, bool write) override {
+        return alloc(page_size, offset, ptep, write);
     }
-    virtual void* alloc(size_t size, uintptr_t offset) override {
-        return _file->get_page(_start + offset, offset + _map_offset, size);
+    virtual mmupage alloc(size_t size, uintptr_t offset, hw_ptep ptep, bool write) override {
+        return _file->get_page(offset + _foffset, size, ptep, write, _shared);
     }
-    virtual void free(void *addr, uintptr_t offset) override {
-        _file->put_page(addr, _start + offset, offset + _map_offset, page_size);
+    virtual void free(void *addr, uintptr_t offset, hw_ptep ptep) override {
+        free(addr, page_size, offset, ptep);
     }
-    virtual void free(void *addr, size_t size, uintptr_t offset) override {
-        _file->put_page(addr, _start + offset, offset + _map_offset, size);
-    }
-
-    virtual void set_addr(uintptr_t addr) override {
-        _start = addr;
+    virtual void free(void *addr, size_t size, uintptr_t offset, hw_ptep ptep) override {
+        _file->put_page(addr, offset + _foffset, size, ptep);
     }

     void finalize() {
@@ -970,11 +977,10 @@ public:
 };

 // In the general case, we expect only one element in the list.
-static std::unordered_multimap<void *, uintptr_t> shared_fs_maps;
+static std::unordered_multimap<void *, hw_ptep> shared_fs_maps;
 // We need to reference count the buffer, but first we need to store the
 // buffer somewhere we can find
 static std::unordered_map<void *, unsigned int> shared_fs_buf_refcnt;
-
 // Can't use the vma_list_mutex, because if we do, we can have a deadlock where
 // we call into the filesystem to read data with the vma_list_mutex held - because
 // we do that for complex operate operations, and if the filesystem decides to evict
@@ -985,45 +991,60 @@ static void fs_buf_get(void *buf_addr)
 {
     auto b = shared_fs_buf_refcnt.find(buf_addr);
     if (b == shared_fs_buf_refcnt.end()) {
-        shared_fs_buf_refcnt.insert(std::make_pair(buf_addr, 1));
+        shared_fs_buf_refcnt.emplace(buf_addr, 1);
         return;
     }
     b->second++;
 }

-static bool fs_buf_put(void *buf_addr)
+static bool fs_buf_put(void *buf_addr, unsigned dec = 1)
 {
     auto b = shared_fs_buf_refcnt.find(buf_addr);
     assert(b != shared_fs_buf_refcnt.end());
-    auto old = --b->second;
-    if (old == 0) {
+    assert(b->second >= dec);
+    b->second -= dec;
+    if (b->second == 0) {
         shared_fs_buf_refcnt.erase(buf_addr);
         return true;
     }
     return false;
 }

-void add_mapping(void *buf_addr, uintptr_t off, uintptr_t vaddr)
+void add_mapping(void *buf_addr, void *page, hw_ptep ptep)
 {
     WITH_LOCK(shared_fs_mutex) {
-        shared_fs_maps.insert(std::make_pair(buf_addr + off, vaddr));
+        shared_fs_maps.emplace(page, ptep);
         fs_buf_get(buf_addr);
     }
 }

-bool remove_mapping(void *buf_addr, void *paddr, uintptr_t addr)
+bool remove_mapping(void *buf_addr, void *paddr, hw_ptep ptep)
 {
     WITH_LOCK(shared_fs_mutex) {
         auto buf = shared_fs_maps.equal_range(paddr);
         for (auto it = buf.first; it != buf.second; it++) {
-            auto v = (*it).second;
-            if (v == addr) {
+            auto stored = (*it).second;
+            if (stored == ptep) {
                 shared_fs_maps.erase(it);
-                break;
+                return fs_buf_put(buf_addr);
             }
         }
-        return fs_buf_put(buf_addr);
     }
+    return false;
+}
+
+bool lookup_mapping(void *paddr, hw_ptep ptep)
+{
+    WITH_LOCK(shared_fs_mutex) {
+        auto buf = shared_fs_maps.equal_range(paddr);
+        for (auto it = buf.first; it != buf.second; it++) {
+            auto stored = (*it).second;
+            if (stored == ptep) {
+                return true;
+            }
+        }
+    }
+    return false;
 }

 uintptr_t allocate(vma *v, uintptr_t start, size_t size, bool search)
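A usage sketch of the three tracking helpers above, condensed from core/pagecache.cc later in this patch: shared_fs_maps records which ptes map a given ARC page, while shared_fs_buf_refcnt counts, per ARC buffer, how many such ptes exist across all of its pages.

    // Sketch: one pte's lifetime against an ARC page.
    mmu::add_mapping(buf, page, ptep);     // remember (page -> ptep), refcnt[buf]++
    // ... page stays mapped read-only (cow) while the buffer is shared ...
    if (mmu::remove_mapping(buf, page, ptep)) {
        // true only when this was the last pte referencing buf anywhere,
        // i.e. the caller may now drop the ARC hold (ARC_ACTION_RELEASE).
    }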
@@ -1071,34 +1092,43 @@ void vcleanup(void* addr, size_t size)
 }

 template<account_opt Account = account_opt::no>
-ulong populate_vma(vma *vma, void *v, size_t size)
+ulong populate_vma(vma *vma, void *v, size_t size, bool write = false)
 {
     page_allocator *map = vma->page_ops();
     auto total = vma->has_flags(mmap_small) ?
-        vma->operate_range(populate_small<Account>(map, vma->perm(), vma->map_dirty()), v, size) :
-        vma->operate_range(populate<Account>(map, vma->perm(), vma->map_dirty()), v, size);
+        vma->operate_range(populate_small<Account>(map, vma->perm(), write, vma->map_dirty()), v, size) :
+        vma->operate_range(populate<Account>(map, vma->perm(), write, vma->map_dirty()), v, size);
     map->finalize();
     return total;
 }

-TRACEPOINT(trace_mmu_invalidate, "addr=%p, vaddr=%p", void *, uintptr_t);
-void unmap_address(void *addr, size_t size)
+void clear_pte(hw_ptep ptep)
 {
+    ptep.write(make_empty_pte());
+}
+
+void clear_pte(std::pair<void* const, hw_ptep>& pair)
+{
+    clear_pte(pair.second);
+}
+
+bool unmap_address(void *buf_addr, void *addr, size_t size)
+{
+    bool last;
+    unsigned refs = 0;
     size = align_up(size, page_size);
     WITH_LOCK(shared_fs_mutex) {
-        shared_fs_buf_refcnt.erase(addr);
         for (uintptr_t a = reinterpret_cast<uintptr_t>(addr); size; a += page_size, size -= page_size) {
             addr = reinterpret_cast<void*>(a);
             auto buf = shared_fs_maps.equal_range(addr);
-            for (auto it = buf.first; it != buf.second; it++) {
-                auto vaddr = (*it).second;
-                trace_mmu_invalidate(addr, vaddr);
-                operate_range(page_out(), (void *)vaddr, page_size);
-            }
+            refs += clear_ptes(buf.first, buf.second);
             shared_fs_maps.erase(addr);
         }
+        last = refs ? fs_buf_put(buf_addr, refs) : false;
     }
+    tlb_flush();
+    return last;
 }

@@ -1117,12 +1147,12 @@ void* map_anon(const void* addr, size_t size, unsigned flags, unsigned perm)

 std::unique_ptr<file_vma> default_file_mmap(file* file, addr_range range, unsigned flags, unsigned perm, off_t offset)
 {
-    return std::unique_ptr<file_vma>(new file_vma(range, perm, file, offset, flags & mmu::mmap_shared, new map_file_page_read(file, offset)));
+    return std::unique_ptr<file_vma>(new file_vma(range, perm, flags, file, offset, new map_file_page_read(file, offset)));
 }

 std::unique_ptr<file_vma> map_file_mmap(file* file, addr_range range, unsigned flags, unsigned perm, off_t offset)
 {
-    return std::unique_ptr<file_vma>(new file_vma(range, perm, file, offset, flags & mmu::mmap_shared, new map_file_page_mmap(file, offset)));
+    return std::unique_ptr<file_vma>(new file_vma(range, perm, flags, file, offset, new map_file_page_mmap(file, offset, flags & mmap_shared)));
 }

 void* map_file(const void* addr, size_t size, unsigned flags, unsigned perm,
@@ -1251,7 +1281,6 @@ vma::~vma()
 void vma::set(uintptr_t start, uintptr_t end)
 {
     _range = addr_range(align_down(start), align_up(end));
-    _page_ops->set_addr(start);
 }

 void vma::protect(unsigned perm)
@@ -1328,7 +1357,7 @@ void vma::fault(uintptr_t addr, exception_frame *ef)
         size = page_size;
     }

-    auto total = populate_vma<account_opt::yes>(this, (void*)addr, size);
+    auto total = populate_vma<account_opt::yes>(this, (void*)addr, size, ef->error_code & page_fault_write);

     if (_flags & mmap_jvm_heap) {
         memory::stats::on_jvm_heap_alloc(total);
@@ -1561,11 +1590,10 @@ ulong map_jvm(unsigned char* jvm_addr, size_t size, size_t align, balloon_ptr b)
     return 0;
 }

-file_vma::file_vma(addr_range range, unsigned perm, fileref file, f_offset offset, bool shared, page_allocator* page_ops)
-    : vma(range, perm, shared ? mmap_small : 0, !shared, page_ops)
+file_vma::file_vma(addr_range range, unsigned perm, unsigned flags, fileref file, f_offset offset, page_allocator* page_ops)
+    : vma(range, perm, flags | mmap_small, !(flags & mmap_shared), page_ops)
     , _file(file)
     , _offset(offset)
-    , _shared(shared)
 {
     int err = validate_perm(perm);

@@ -1623,7 +1651,7 @@ private:

 error file_vma::sync(uintptr_t start, uintptr_t end)
 {
-    if (!_shared)
+    if (!has_flags(mmap_shared))
         return make_error(ENOMEM);
     start = std::max(start, _range.start());
     end = std::min(end, _range.end());
@@ -1644,7 +1672,7 @@ int file_vma::validate_perm(unsigned perm)
         return EACCES;
     }
     if (perm & perm_write) {
-        if (_shared && !(_file->f_flags & FWRITE)) {
+        if (has_flags(mmap_shared) && !(_file->f_flags & FWRITE)) {
             return EACCES;
         }
     }
@@ -1666,7 +1694,7 @@ std::unique_ptr<file_vma> shm_file::mmap(addr_range range, unsigned flags, unsig
     return map_file_mmap(this, range, flags, perm, offset);
 }

-void* shm_file::get_page(uintptr_t start, uintptr_t offset, size_t size)
+mmupage shm_file::get_page(uintptr_t offset, size_t size, hw_ptep ptep, bool write, bool shared)
 {
     uintptr_t hp_off = ::align_down(offset, huge_page_size);
     void *addr;
@@ -1684,7 +1712,7 @@ void* shm_file::get_page(uintptr_t start, uintptr_t offset, size_t size)
     return static_cast<char*>(addr) + offset - hp_off;
 }

-void shm_file::put_page(void *addr, uintptr_t start, uintptr_t offset, size_t size) {}
+void shm_file::put_page(void *addr, uintptr_t offset, size_t size, hw_ptep ptep) {}

 shm_file::shm_file(size_t size, int flags) : special_file(flags, DTYPE_UNSPEC), _size(size) {}
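Before the new file, one tying-together note: vma::fault() above now forwards the hardware write bit, which is what makes the first write to a CoW-mapped ARC page re-populate it. Condensed sketch (populate_vma is file-local to core/mmu.cc, so this is illustrative, not patch code):

    // Sketch of the write-fault plumbing in vma::fault():
    void fault_sketch(vma* v, uintptr_t addr, exception_frame* ef)
    {
        bool write = ef->error_code & page_fault_write;   // CoW upgrade?
        populate_vma<account_opt::yes>(v, reinterpret_cast<void*>(addr),
                                       page_size, write);
    }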
+ */ + + +#include <unordered_map> +#include <unordered_set> +#include <deque> +#include <osv/pagecache.hh> +#include <osv/mempool.hh> +#include <fs/vfs/vfs.h> + +namespace pagecache { +struct hashkey { + dev_t dev; + ino_t ino; + off_t offset; + bool operator==(const hashkey& a) const noexcept { + return (dev == a.dev) && (ino == a.ino) && (offset == a.offset); + } +}; +} + +namespace std { +template<> +struct hash<pagecache::hashkey> { + size_t operator()(const pagecache::hashkey key) const noexcept { + hash<uint64_t> h; + return h(key.dev) ^ h(key.ino) ^ h(key.offset); + } +}; + +template<> struct hash<mmu::hw_ptep> { + size_t operator()(const mmu::hw_ptep& ptep) const noexcept { + hash<const mmu::pt_element*> h; + return h(ptep.release()); + } +}; +} + +namespace pagecache { + +class cached_page { +private: + const hashkey _key; + struct dentry* _dp; + void* _page; + std::unordered_set<mmu::hw_ptep> _ptes; // set of pointers to ptes that map the page +public: + cached_page(hashkey key, vfs_file* fp) : _key(key) { + _dp = fp->f_dentry; + dref(_dp); + _page = memory::alloc_page(); + } + ~cached_page() { + if (_page) { + writeback(); + memory::free_page(_page); + drele(_dp); + } + } + + int writeback() + { + struct vnode *vp = _dp->d_vnode; + int error; + struct iovec iov {_page, mmu::page_size}; + struct uio uio {&iov, 1, _key.offset, mmu::page_size, UIO_WRITE}; + + vn_lock(vp); + error = VOP_WRITE(vp, &uio, 0); + vn_unlock(vp); + + return error; + } + + void map(mmu::hw_ptep ptep) { + _ptes.emplace(ptep); + } + void unmap(mmu::hw_ptep ptep) { + _ptes.erase(ptep); + } + void* addr() { + return _page; + } + void flush() { + mmu::clear_ptes(_ptes.begin(), _ptes.end()); + } + const hashkey& key() { + return _key; + } + void* release() { // called to demote a page from cache page to anonymous + assert(_ptes.size() == 0); + void *p = _page; + _page = nullptr; + drele(_dp); + return p; + } +}; + +constexpr unsigned lru_max_length = 100; +constexpr unsigned lru_free_count = 20; + +static mutex lock; +static std::unordered_map<hashkey, cached_page*> cache; +static std::deque<cached_page*> lru; + +static std::unique_ptr<cached_page> create_write_cached_page(vfs_file* fp, hashkey& key) +{ + size_t bytes; + cached_page* cp = new cached_page(key, fp); + struct iovec iov {cp->addr(), mmu::page_size}; + + sys_read(fp, &iov, 1, key.offset, &bytes); + return std::unique_ptr<cached_page>(cp); +} + +static void insert(cached_page* cp) { + static cached_page* tofree[lru_free_count]; + cache.emplace(cp->key(), cp); + lru.push_front(cp); + + if (lru.size() > lru_max_length) { + for (unsigned i = 0; i < lru_free_count; i++) { + cached_page *p = lru.back(); + lru.pop_back(); + cache.erase(p->key()); + p->flush(); + tofree[i] = p; + } + mmu::tlb_flush(); + for (auto p: tofree) { + delete p; + } + } +} + +static cached_page *find_in_write_cache(hashkey& key) +{ + auto cpi = cache.find(key); + + if (cpi == cache.end()) { + return nullptr; + } else { + return cpi->second; + } +} + +mmu::mmupage get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, bool write, bool shared) +{ + void *start, *page; + size_t len; + struct stat st; + fp->stat(&st); + hashkey key {st.st_dev, st.st_ino, offset}; + SCOPE_LOCK(lock); + cached_page* cp = find_in_write_cache(key); + + if (write) { + if (!cp) { + auto newcp = create_write_cached_page(fp, key); + // FIXME: if page is not in ARC it will be read here, + // FIXME: we need a function that return NULL if page is not in ARC + fp->get_arcbuf(offset, ARC_ACTION_QUERY, &start, &len, 
&page); + if (shared) { + // write fault into shared mapping, there page is not in write cache yet, add it. + cp = newcp.release(); + insert(cp); + // page is moved from ARC to write cache + // remove any mapping to ARC page + // FIXME: if pte we are changing is the only one, no need to unmap + if (mmu::unmap_address(start, page, mmu::page_size)) { + fp->get_arcbuf(offset, ARC_ACTION_RELEASE, &start, &len, &page); + } + } else { + // remove mapping to ARC page if exists + if (mmu::remove_mapping(start, page, ptep)) { + fp->get_arcbuf(offset, ARC_ACTION_RELEASE, &start, &len, &page); + } + // cow of private page from ARC + return newcp->release(); + } + } else if (!shared) { + // cow of private page from write cache + page = memory::alloc_page(); + memcpy(page, cp->addr(), mmu::page_size); + return page; + } + } else if (!cp) { + // read fault and page is not in write cache yet, return one from ARC, mark it cow + fp->get_arcbuf(offset, ARC_ACTION_HOLD, &start, &len, &page); + mmu::add_mapping(start, page, ptep); + return mmu::mmupage(page, true); + } + + cp->map(ptep); + return cp->addr(); +} + +void release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep ptep) +{ + struct stat st; + fp->stat(&st); + hashkey key {st.st_dev, st.st_ino, offset}; + SCOPE_LOCK(lock); + cached_page *cp = find_in_write_cache(key); + + // page is either in ARC cache or write cache or private page + if (cp && cp->addr() == addr) { + // page is in write cache + cp->unmap(ptep); + } else if (mmu::lookup_mapping(addr, ptep)) { + // page is in ARC + void *start, *page; + size_t len; + fp->get_arcbuf(offset, ARC_ACTION_QUERY, &start, &len, &page); + assert (addr == page); + if (mmu::remove_mapping(start, page, ptep)) { + fp->get_arcbuf(offset, ARC_ACTION_RELEASE, &start, &len, &page); + } + } else { + // private page + memory::free_page(addr); + } +} +} diff --git a/fs/vfs/vfs_fops.cc b/fs/vfs/vfs_fops.cc index 82dbf5b017598eec26de84b8e0e0d70728890455..4878fdd326f04073d58d8221cc0d6769a472ee89 100644 --- a/fs/vfs/vfs_fops.cc +++ b/fs/vfs/vfs_fops.cc @@ -13,6 +13,8 @@ #include <fs/vfs/vfs.h> #include <osv/vfs_file.hh> #include <osv/mmu.hh> +#include "arch-mmu.hh" +#include <osv/pagecache.hh> vfs_file::vfs_file(unsigned flags) : file(flags, DTYPE_VNODE) @@ -137,82 +139,49 @@ int vfs_file::chmod(mode_t mode) abort(); } -// Locking: vn_lock will call into the filesystem, and that can trigger an -// eviction that will hold the mmu-side lock that protects the mappings -// Always follow that order. We however can't just get rid of the mmu-side lock, -// because not all invalidations will be synchronous. -void* vfs_file::get_page(uintptr_t start, uintptr_t off, size_t size) +mmu::mmupage vfs_file::get_page(uintptr_t off, size_t size, mmu::hw_ptep ptep, bool write, bool shared) { - assert(size == mmu::page_size); - - auto fp = this; - struct vnode *vp = fp->f_dentry->d_vnode; - - iovec io; - io.iov_base = nullptr; - io.iov_len = 0; - - uio_mapper map_data; - uio *data = &map_data.uio; - - data->uio_iov = &io; - data->uio_iovcnt = 1; - data->uio_offset = off_t(off); - // FIXME: If the buffer can hold, remap other pages as well, up to the - // buffer size. However, this would require heavy changes in the fill - // and map code. Let's try it later. 
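The get_arcbuf() out-parameters deserve a note: start/len describe the whole ARC buffer, while page points at the page-sized slice of it that backs the requested offset (the VOP_CACHE call rewrites uio_offset to the offset within the buffer). The release path in pagecache.cc drives it like this, lightly condensed from pagecache::release():

    // Look the buffer up without touching its refcount, then drop the
    // ARC hold only when the last pte goes away.
    void *start, *page;
    size_t len;
    fp->get_arcbuf(offset, ARC_ACTION_QUERY, &start, &len, &page);
    assert(addr == page);
    if (mmu::remove_mapping(start, page, ptep)) {
        fp->get_arcbuf(offset, ARC_ACTION_RELEASE, &start, &len, &page);
    }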
diff --git a/include/osv/file.h b/include/osv/file.h
index 9d3bcee563d53323594c6310d2b429127ae82a7d..76ddc4a43d4e97a93d965bf6968eb8630cfafeab 100755
--- a/include/osv/file.h
+++ b/include/osv/file.h
@@ -49,6 +49,7 @@
 #include <osv/addr_range.hh>
 #include <osv/rcu.hh>
 #include <osv/error.h>
+#include "arch-mmu.hh"

 #endif

@@ -96,8 +97,8 @@ struct file {
     virtual std::unique_ptr<mmu::file_vma> mmap(addr_range range, unsigned flags, unsigned perm, off_t offset) { throw make_error(ENODEV); }

-    virtual void* get_page(uintptr_t start, uintptr_t offset, size_t size) { throw make_error(ENOSYS);}
-    virtual void put_page(void *addr, uintptr_t start, uintptr_t offset, size_t size) { throw make_error(ENOSYS);}
+    virtual mmu::mmupage get_page(uintptr_t offset, size_t size, mmu::hw_ptep ptep, bool write, bool shared) { throw make_error(ENOSYS); }
+    virtual void put_page(void *addr, uintptr_t offset, size_t size, mmu::hw_ptep ptep) { throw make_error(ENOSYS); }

     int f_flags;            /* open flags */
     int f_count;            /* reference count, see below */
diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh
new file mode 100644
index 0000000000000000000000000000000000000000..6d14b42dd52119c89a6afe875d6247dfd096790a
--- /dev/null
+++ b/include/osv/mmu-defs.hh
@@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2014 Cloudius Systems, Ltd.
+ *
+ * This work is open source software, licensed under the terms of the
+ * BSD license as described in the LICENSE file in the top-level directory.
+ */
+
+#ifndef MMU_DEFS_HH
+#define MMU_DEFS_HH
+
+#include <stdint.h>
+
+namespace mmu {
+
+constexpr uintptr_t page_size = 4096;
+constexpr int page_size_shift = 12; // log2(page_size)
+
+constexpr int pte_per_page = 512;
+constexpr int pte_per_page_shift = 9; // log2(pte_per_page)
+
+constexpr uintptr_t huge_page_size = mmu::page_size*pte_per_page; // 2 MB
+
+typedef uint64_t f_offset;
+typedef uint64_t phys;
+
+static char* const phys_mem = reinterpret_cast<char*>(0xffffc00000000000);
+// area for debug allocations:
+static char* const debug_base = reinterpret_cast<char*>(0xffffe00000000000);
+
+enum {
+    perm_read = 1,
+    perm_write = 2,
+    perm_exec = 4,
+    perm_rx = perm_read | perm_exec,
+    perm_rw = perm_read | perm_write,
+    perm_rwx = perm_read | perm_write | perm_exec,
+};
+
+enum {
+    page_fault_prot = 1ul << 0,
+    page_fault_write = 1ul << 1,
+    page_fault_user = 1ul << 2,
+    page_fault_rsvd = 1ul << 3,
+    page_fault_insn = 1ul << 4,
+};
+
+enum {
+    mmap_fixed = 1ul << 0,
+    mmap_populate = 1ul << 1,
+    mmap_shared = 1ul << 2,
+    mmap_uninitialized = 1ul << 3,
+    mmap_jvm_heap = 1ul << 4,
+    mmap_small = 1ul << 5,
+    mmap_jvm_balloon = 1ul << 6,
+};
+
+class mmupage {
+    void* _page;
+    bool _cow;
+public:
+    mmupage(void *page, bool cow = false) : _page(page), _cow(cow) {}
+    void* vaddr() const;
+    phys paddr() const;
+    bool cow() const;
+};
+
+}
+
+#endif
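mmupage is the small value type that lets one alloc() answer mean two different things. A hedged sketch of the two cases (arc_page is a stand-in name, not from the patch):

    // Anonymous memory: mapped with the vma's own permissions.
    mmu::mmupage anon(memory::alloc_page());
    // ARC-backed page: cow = true, so populate maps it read-only and a
    // later write fault can swap in a private copy.
    mmu::mmupage arc(arc_page, true);
    // paddr() throws for mmupage(nullptr) - that is how a failed
    // huge-page allocation unwinds into populate::huge_page()'s try/catch.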
diff --git a/include/osv/mmu.hh b/include/osv/mmu.hh
index 598240fe7dc5cd2e6fa113f7f2152e8c6bade520..a4b19a27b1774aa060d2c2bfce5191bcf16c6c2b 100644
--- a/include/osv/mmu.hh
+++ b/include/osv/mmu.hh
@@ -17,6 +17,8 @@
 #include <osv/addr_range.hh>
 #include <unordered_map>
 #include <memory>
+#include <osv/mmu-defs.hh>
+#include "arch-mmu.hh"

 struct exception_frame;
 class balloon;
@@ -27,52 +29,11 @@ typedef std::shared_ptr<balloon> balloon_ptr;
  */
 namespace mmu {

-constexpr uintptr_t page_size = 4096;
-constexpr int page_size_shift = 12; // log2(page_size)
-
-constexpr int pte_per_page = 512;
-constexpr int pte_per_page_shift = 9; // log2(pte_per_page)
-
-constexpr uintptr_t huge_page_size = mmu::page_size*pte_per_page; // 2 MB
-
-typedef uint64_t f_offset;
-
-static char* const phys_mem = reinterpret_cast<char*>(0xffffc00000000000);
-// area for debug allocations:
-static char* const debug_base = reinterpret_cast<char*>(0xffffe00000000000);
-
 constexpr inline unsigned pt_index(void *virt, unsigned level)
 {
     return (reinterpret_cast<ulong>(virt) >> (page_size_shift + level * pte_per_page_shift)) & (pte_per_page - 1);
 }

-enum {
-    perm_read = 1,
-    perm_write = 2,
-    perm_exec = 4,
-    perm_rx = perm_read | perm_exec,
-    perm_rw = perm_read | perm_write,
-    perm_rwx = perm_read | perm_write | perm_exec,
-};
-
-enum {
-    page_fault_prot = 1ul << 0,
-    page_fault_write = 1ul << 1,
-    page_fault_user = 1ul << 2,
-    page_fault_rsvd = 1ul << 3,
-    page_fault_insn = 1ul << 4,
-};
-
-enum {
-    mmap_fixed = 1ul << 0,
-    mmap_populate = 1ul << 1,
-    mmap_shared = 1ul << 2,
-    mmap_uninitialized = 1ul << 3,
-    mmap_jvm_heap = 1ul << 4,
-    mmap_small = 1ul << 5,
-    mmap_jvm_balloon = 1ul << 6,
-};
-
 struct page_allocator;

 class vma {
@@ -129,7 +90,7 @@ public:
 class file_vma : public vma {
 public:
-    file_vma(addr_range range, unsigned perm, fileref file, f_offset offset, bool shared, page_allocator *page_ops);
+    file_vma(addr_range range, unsigned perm, unsigned flags, fileref file, f_offset offset, page_allocator *page_ops);
     ~file_vma();
     virtual void split(uintptr_t edge) override;
     virtual error sync(uintptr_t start, uintptr_t end) override;
@@ -138,7 +99,6 @@ private:
     f_offset offset(uintptr_t addr);
     fileref _file;
     f_offset _offset;
-    bool _shared;
 };

 ulong map_jvm(unsigned char* addr, size_t size, size_t align, balloon_ptr b);
@@ -181,8 +141,8 @@ public:
     virtual int stat(struct stat* buf) override;
     virtual int close() override;
     virtual std::unique_ptr<file_vma> mmap(addr_range range, unsigned flags, unsigned perm, off_t offset) override;
-    virtual void* get_page(uintptr_t start, uintptr_t offset, size_t size) override;
-    virtual void put_page(void *addr, uintptr_t start, uintptr_t offset, size_t size) override;
+    virtual mmupage get_page(uintptr_t offset, size_t size, hw_ptep ptep, bool write, bool shared) override;
+    virtual void put_page(void *addr, uintptr_t offset, size_t size, hw_ptep ptep) override;
 };

 void* map_file(const void* addr, size_t size, unsigned flags, unsigned perm,
@@ -200,11 +160,14 @@ bool isreadable(void *addr, size_t size);
 std::unique_ptr<file_vma> default_file_mmap(file* file, addr_range range, unsigned flags, unsigned perm, off_t offset);
 std::unique_ptr<file_vma> map_file_mmap(file* file, addr_range range, unsigned flags, unsigned perm, off_t offset);

-void unmap_address(void *addr, size_t size);
-void add_mapping(void *buf_addr, uintptr_t offset, uintptr_t vaddr);
-bool remove_mapping(void *buf_addr, void *paddr, uintptr_t addr);
+bool unmap_address(void* buf, void *addr, size_t size);
+void add_mapping(void *buf_addr, void* addr, hw_ptep ptep);
+bool remove_mapping(void *buf_addr, void *paddr, hw_ptep ptep);
+bool lookup_mapping(void *paddr, hw_ptep ptep);
+void tlb_flush();
+void clear_pte(hw_ptep ptep);
+void clear_pte(std::pair<void* const, hw_ptep>& pair);

-typedef uint64_t phys;
 phys virt_to_phys(void *virt);
 void* phys_to_virt(phys pa);

@@ -240,6 +203,17 @@ void vm_fault(uintptr_t addr, exception_frame* ef);

 std::string procfs_maps();

+template<typename I>
+unsigned clear_ptes(I start, I end)
+{
+    unsigned i = 0;
+    for (auto it = start; it != end; it++) {
+        clear_pte(*it);
+        i++;
+    }
+    return i;
+}
+
 }

 #endif
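clear_ptes() is deliberately iterator-generic: unmap_address() feeds it an equal_range over shared_fs_maps (pairs, hence the pair overload of clear_pte()), while cached_page::flush() feeds it a set of bare ptes. Typical use, as in the page cache:

    // Zero every pte that maps a page, then flush the TLB once.
    std::unordered_set<mmu::hw_ptep> ptes; // filled by cached_page::map()
    unsigned cleared = mmu::clear_ptes(ptes.begin(), ptes.end());
    if (cleared) {
        mmu::tlb_flush();   // one flush for the whole batch
    }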
+ */ + +#include <osv/file.h> +#include <osv/vfs_file.hh> +#include <osv/mmu.hh> +#include "arch-mmu.hh" + +namespace pagecache { + +mmu::mmupage get(vfs_file* fp, off_t offset, mmu::hw_ptep ptep, bool write, bool shared); +void release(vfs_file* fp, void *addr, off_t offset, mmu::hw_ptep ptep); + +} diff --git a/include/osv/uio.h b/include/osv/uio.h index bd9ff17b0223a6c3ded482b257746a733f7b05ef..b57b81e5139a467c9ed96d95c0bf0f27bc75e190 100644 --- a/include/osv/uio.h +++ b/include/osv/uio.h @@ -60,16 +60,6 @@ struct uio { enum uio_rw uio_rw; /* operation */ }; -// This layout support only one buffer per uio, which means that it will -// only work for a iovcnt of 1. We can extend later if needed, but because we -// are reading it page by page, it should be fine for now. -struct uio_mapper { - struct uio uio; - size_t buf_size; - size_t buf_off; - void *buffer; -}; - int copyinuio(struct iovec *iovp, u_int iovcnt, struct uio **uiop); int uiomove(void *cp, int n, struct uio *uio); diff --git a/include/osv/vfs_file.hh b/include/osv/vfs_file.hh index 0f107d799faf84d0683bc04311ea7d5747dd3b8b..2f4fe007bc82eacee99e695282b4abd61854bfc4 100644 --- a/include/osv/vfs_file.hh +++ b/include/osv/vfs_file.hh @@ -22,8 +22,9 @@ public: virtual int close() override; virtual int chmod(mode_t mode) override; virtual std::unique_ptr<mmu::file_vma> mmap(addr_range range, unsigned flags, unsigned perm, off_t offset) override; - virtual void* get_page(uintptr_t start, uintptr_t offset, size_t size); - virtual void put_page(void *addr, uintptr_t start, uintptr_t offset, size_t size); + virtual mmu::mmupage get_page(uintptr_t offset, size_t size, mmu::hw_ptep ptep, bool write, bool shared); + virtual void put_page(void *addr, uintptr_t offset, size_t size, mmu::hw_ptep ptep); + void get_arcbuf(uintptr_t offset, unsigned action, void** start, size_t* len, void** page); }; #endif /* VFS_FILE_HH_ */ diff --git a/include/osv/vnode.h b/include/osv/vnode.h index dc01d5803fd935850a2c5f4b07a1030ec734a0ec..c5ed0449403be76c1d2185f86bf1b352750fa39f 100755 --- a/include/osv/vnode.h +++ b/include/osv/vnode.h @@ -118,6 +118,12 @@ struct vattr { #define IO_APPEND 0x0001 #define IO_SYNC 0x0002 +/* + * ARC actions + */ +#define ARC_ACTION_QUERY 0 +#define ARC_ACTION_HOLD 1 +#define ARC_ACTION_RELEASE 2 typedef int (*vnop_open_t) (struct file *); typedef int (*vnop_close_t) (struct vnode *, struct file *); @@ -139,8 +145,7 @@ typedef int (*vnop_setattr_t) (struct vnode *, struct vattr *); typedef int (*vnop_inactive_t) (struct vnode *); typedef int (*vnop_truncate_t) (struct vnode *, off_t); typedef int (*vnop_link_t) (struct vnode *, struct vnode *, char *); -typedef int (*vnop_map_t) (struct vnode *, struct file *, struct uio *); -typedef int (*vnop_unmap_t) (struct vnode *, struct file *, struct uio *); +typedef int (*vnop_cache_t) (struct vnode *, struct file *, struct uio *, unsigned action); /* * vnode operations @@ -165,8 +170,7 @@ struct vnops { vnop_inactive_t vop_inactive; vnop_truncate_t vop_truncate; vnop_link_t vop_link; - vnop_map_t vop_map; - vnop_unmap_t vop_unmap; + vnop_cache_t vop_cache; }; /* @@ -175,8 +179,7 @@ struct vnops { #define VOP_OPEN(VP, FP) ((VP)->v_op->vop_open)(FP) #define VOP_CLOSE(VP, FP) ((VP)->v_op->vop_close)(VP, FP) #define VOP_READ(VP, FP, U, F) ((VP)->v_op->vop_read)(VP, FP, U, F) -#define VOP_MAP(VP, FP, U) ((VP)->v_op->vop_map)(VP, FP, U) -#define VOP_UNMAP(VP, FP, U) ((VP)->v_op->vop_unmap)(VP, FP, U) +#define VOP_CACHE(VP, FP, U, A) ((VP)->v_op->vop_cache)(VP, FP, U, A) #define 
VOP_WRITE(VP, U, F) ((VP)->v_op->vop_write)(VP, U, F) #define VOP_SEEK(VP, FP, OLD, NEW) ((VP)->v_op->vop_seek)(VP, FP, OLD, NEW) #define VOP_IOCTL(VP, FP, C, A) ((VP)->v_op->vop_ioctl)(VP, FP, C, A)
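Finally, the filesystem-facing contract in one place. A filesystem opts into the mmap page cache simply by providing vop_cache (ZFS wires zfs_arc there); vfs_file::mmap() falls back to default_file_mmap() otherwise. The table below is a hypothetical example, not part of the patch:

    /* A filesystem that supports ARC-backed mappings: */
    struct vnops myfs_vnops = {
        /* ... open/close/read/write/... in declaration order ... */
        myfs_inactive,      /* inactive */
        myfs_truncate,      /* truncate */
        myfs_link,          /* link */
        myfs_cache,         /* cache: int (*)(vnode*, file*, uio*, unsigned) */
    };

    /* and the generic layer drives it through the macro: */
    error = VOP_CACHE(vp, fp, &data, ARC_ACTION_HOLD);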