/*
     * Copyright (C) 2013 Cloudius Systems, Ltd.
     *
     * This work is open source software, licensed under the terms of the
     * BSD license as described in the LICENSE file in the top-level directory.
     */
    
    
#include "mmu.hh"
#include "mempool.hh"
#include "processor.hh"
#include "debug.hh"
#include "exceptions.hh"
#include <boost/format.hpp>
#include <string.h>
#include "libc/signal.hh"
#include "align.hh"
#include "interrupt.hh"
#include "prio.hh"
#include <safe-ptr.hh>
#include "fs/vfs/vfs.h"
#include <osv/error.h>
#include <osv/trace.hh>
#include "arch-mmu.hh"
    
    
    extern void* elf_start;
    extern size_t elf_size;
    
    
    typedef boost::format fmt;
    
    
    
    
    extern const char text_start[], text_end[];
    
    
    namespace mmu {
    
    
    
    namespace bi = boost::intrusive;
    
    
    
    class vma_compare {
    public:
        bool operator ()(const vma& a, const vma& b) const {
            return a.addr() < b.addr();
        }
    };
    
    
    
    typedef boost::intrusive::set<vma,
                                  bi::compare<vma_compare>,
                                  bi::member_hook<vma,
                                                  bi::set_member_hook<>,
                                                  &vma::_vma_list_hook>,
                                  bi::optimize_size<true>
                                  > vma_list_base;
    
    
    
struct vma_list_type : vma_list_base {
    vma_list_type() {
        // insert markers for the edges of allocatable area
        // simplifies searches
        insert(*new anon_vma(addr_range(0, 0), 0, 0));
        uintptr_t e = 0x800000000000;
        insert(*new anon_vma(addr_range(e, e), 0, 0));
    }
};
    
    __attribute__((init_priority((int)init_prio::vma_list)))
    
    vma_list_type vma_list;
    
    
    
    // A fairly coarse-grained mutex serializing modifications to both
    // vma_list and the page table itself.
    mutex vma_list_mutex;
    
    
hw_ptep follow(pt_element pte)
{
    return hw_ptep::force(phys_cast<pt_element>(pte.next_pt_addr()));
}
    // 1's for the bits provided by the pte for this level
    // 0's for the bits provided by the virtual address for this level
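// For example, at level 1 (2MB pages) the shift is 12 + 9 = 21, so the mask
// is ~0x1fffff: bits 21 and up come from the pte, bits 0-20 from the virtual
// address.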
    phys pte_level_mask(unsigned level)
    {
        auto shift = level * ilog2_roundup_constexpr(pte_per_page)
            + ilog2_roundup_constexpr(page_size);
        return ~((phys(1) << shift) - 1);
    }
    
    
    const unsigned nlevels = 4;
    
    
    
void* phys_to_virt(phys pa)
{
    // The ELF is mapped 1:1
    void* phys_addr = reinterpret_cast<void*>(pa);
    if ((phys_addr >= elf_start) && (phys_addr < elf_start + elf_size)) {
        return phys_addr;
    }

    return phys_mem + pa;
}
    
    
    
    phys virt_to_phys_pt(void* virt)
    {
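    // Walk the hardware page table from CR3 down to the leaf entry (a large
    // pte or a level-0 pte), then combine the frame address with the offset
    // bits of the virtual address.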
        auto v = reinterpret_cast<uintptr_t>(virt);
        auto pte = pt_element::force(processor::read_cr3());
        unsigned level = nlevels;
        while (level > 0 && !pte.large()) {
            assert(pte.present() || level == nlevels);
            --level;
            auto pt = follow(pte);
            pte = pt.at(pt_index(virt, level)).read();
        }
        assert(!pte.empty());
        auto mask = pte_level_mask(level);
        return (pte.addr(level != 0) & mask) | (v & ~mask);
    }
    
    
phys virt_to_phys(void *virt)
{
    // The ELF is mapped 1:1, so virtual and physical addresses coincide there
    if ((virt >= elf_start) && (virt < elf_start + elf_size)) {
        return reinterpret_cast<phys>(virt);
    }

#if CONF_debug_memory
    if (virt > debug_base) {
        return virt_to_phys_pt(virt);
    }
#endif

    // For now, only allow non-mmaped areas.  Later, we can either
    // bounce such addresses, or lock them in memory and translate
    assert(virt >= phys_mem);
    return static_cast<char*>(virt) - phys_mem;
}
    
    
    
phys allocate_intermediate_level()
{
    phys pt_page = virt_to_phys(memory::alloc_page());
    // since the pt is not yet mapped, we don't need to use hw_ptep
    pt_element* pt = phys_cast<pt_element>(pt_page);
    for (auto i = 0; i < pte_per_page; ++i) {
        pt[i] = make_empty_pte();
    }
    return pt_page;
}

void allocate_intermediate_level(hw_ptep ptep)
{
    phys pt_page = allocate_intermediate_level();
    ptep.write(make_normal_pte(pt_page));
}
    
void free_intermediate_level(hw_ptep ptep)
{
    hw_ptep pt = follow(ptep.read());
    for (auto i = 0; i < pte_per_page; ++i) {
        assert(pt.at(i).read().empty()); // don't free a level which still has pages!
    }
    auto v = pt.release();
    ptep.write(make_empty_pte());
    // FIXME: flush tlb
    memory::free_page(v);
}

void change_perm(hw_ptep ptep, unsigned int perm)
{
    pt_element pte = ptep.read();

        // Note: in x86, if the present bit (0x1) is off, not only read is
        // disallowed, but also write and exec. So in mprotect, if any
        // permission is requested, we must also grant read permission.
        // Linux does this too.
    
        pte.set_present(perm);
        pte.set_writable(perm & perm_write);
        pte.set_nx(!(perm & perm_exec));
    
        ptep.write(pte);
    
    }
    
void split_large_page(hw_ptep ptep, unsigned level)
{
    pt_element pte_orig = ptep.read();
    if (level == 1) {
        pte_orig.set_large(false);
    }
    allocate_intermediate_level(ptep);
    auto pt = follow(ptep.read());
    for (auto i = 0; i < pte_per_page; ++i) {
        pt_element tmp = pte_orig;
        phys addend = phys(i) << (page_size_shift + pte_per_page_shift * (level - 1));
        tmp.set_addr(tmp.addr(level > 1) | addend, level > 1);
        pt.at(i).write(tmp);
    }
}
    
    
    
struct fill_page {
public:
    virtual void fill(void* addr, uint64_t offset, uintptr_t size) = 0;
};
    
    
void debug_count_ptes(pt_element pte, int level, size_t &nsmall, size_t &nhuge)
{
    if (level<4 && !pte.present()){
        // not present - nothing mapped below this entry
    } else if (pte.large()) {
        nhuge++;
    } else if (level==0){
        nsmall++;
    } else {
        hw_ptep pt = follow(pte);
        for(int i=0; i<pte_per_page; ++i) {
            debug_count_ptes(pt.at(i).read(), level-1, nsmall, nhuge);
        }
    }
}

void tlb_flush_this_processor()
{
    // TODO: we can use page_table_root instead of read_cr3(), can be faster
    // when shadow page tables are used.
    processor::write_cr3(processor::read_cr3());
}

// tlb_flush() does a TLB flush on *all* processors, not returning before all
// processors confirm flushing their TLB. This is slow, but necessary for
// correctness so that, for example, after mprotect() returns, no thread on
// any CPU can write to the protected page.
    mutex tlb_flush_mutex;
    sched::thread *tlb_flush_waiter;
    std::atomic<int> tlb_flush_pendingconfirms;
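// tlb_flush() sets tlb_flush_pendingconfirms to the number of remote CPUs and
// sends tlb_flush_ipi; each remote CPU flushes its own TLB and decrements the
// counter, and the CPU that brings it to zero wakes the waiting thread.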
    
    inter_processor_interrupt tlb_flush_ipi{[] {
            tlb_flush_this_processor();
            if (tlb_flush_pendingconfirms.fetch_add(-1) == 1) {
                tlb_flush_waiter->wake();
            }
    }};
    
    void tlb_flush()
    {
        tlb_flush_this_processor();
    
    if (sched::cpus.size() <= 1)
        return;
        std::lock_guard<mutex> guard(tlb_flush_mutex);
    
        tlb_flush_waiter = sched::thread::current();
        tlb_flush_pendingconfirms.store((int)sched::cpus.size() - 1);
        tlb_flush_ipi.send_allbutself();
        sched::thread::wait_until([] {
                return tlb_flush_pendingconfirms.load() == 0;
        });
    }
    
/*
 * a page_range_operation implementation operates (via the operate() method)
 * on a page-aligned byte range of virtual memory. The range is divided into a
 * bulk of aligned huge pages (2MB pages), and if the beginning and end
 * addresses aren't 2MB aligned, there are additional small pages (4KB pages).
 * The appropriate method (small_page() or huge_page()) is called for each of
 * these pages, to implement the operation.
 *
 * By supporting operations directly on whole huge pages, we allow for smaller
 * page tables and better TLB efficiency.
 *
 * TODO: Instead of walking the page table from its root for each page (small
 * or huge), we can more efficiently walk the page table once, calling
 * small_page/huge_page for the relevant page table entries. See linear_map for
 * an example of how this walk can be done.
 */
    class page_range_operation {
    public:
        void operate(void *start, size_t size);
    
        void operate(const vma &vma){ operate((void*)vma.start(), vma.size()); }
    
    protected:
        // offset is the offset of this page in the entire address range
        // (in case the operation needs to know this).
    
        virtual void small_page(hw_ptep ptep, uintptr_t offset) = 0;
        virtual void huge_page(hw_ptep ptep, uintptr_t offset) = 0;
    
        virtual bool should_allocate_intermediate() = 0;
    private:
        void operate_page(bool huge, void *addr, uintptr_t offset);
    };
    
void page_range_operation::operate(void *start, size_t size)
{
    
        start = align_down(start, page_size);
        size = align_up(size, page_size);
        void *end = start + size; // one byte after the end
    
        // Find the largest 2MB-aligned range inside the given byte (or actually,
        // 4K-aligned) range:
        auto hp_start = align_up(start, huge_page_size);
        auto hp_end = align_down(end, huge_page_size);
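    // For example, with start = 0x201000 and end = 0x801000, hp_start is
    // 0x400000 and hp_end is 0x800000: small pages cover [0x201000, 0x400000)
    // and [0x800000, 0x801000), and huge pages cover [0x400000, 0x800000).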
    
        // Fix the hp_start/hp_end in degenerate cases so the following
        // loops do the right thing.
        if (hp_start > end) {
            hp_start = end;
        }
        if (hp_end < start) {
            hp_end = end;
        }
    
        for (void *addr = start; addr < hp_start; addr += page_size) {
            operate_page(false, addr, (uintptr_t)addr-(uintptr_t)start);
        }
        for (void *addr = hp_start; addr < hp_end; addr += huge_page_size) {
            operate_page(true, addr, (uintptr_t)addr-(uintptr_t)start);
        }
        for (void *addr = hp_end; addr < end; addr += page_size) {
        operate_page(false, addr, (uintptr_t)addr-(uintptr_t)start);
    }

    // TODO: consider if instead of requesting a full TLB flush, we should
    // instead try to make more judicious use of INVLPG - e.g., in
    // split_large_page() and other specific places where we modify specific
    // page table entries.
    // TODO: Consider if we're doing tlb_flush() too often, e.g., twice
    // in one mmap which first does evacuate() and then allocate().
    tlb_flush();
}
    
void page_range_operation::operate_page(bool huge, void *addr, uintptr_t offset)
{
    // Walk the page table from its root down to the level of the target page
    // (level 1 for a huge page, level 0 for a small page), allocating or
    // splitting intermediate levels as needed, then apply the operation.
    pt_element pte = pt_element::force(processor::read_cr3());
    auto pt = follow(pte);
    auto ptep = pt.at(pt_index(addr, nlevels - 1));
    unsigned level = nlevels - 1;
    unsigned stopat = huge ? 1 : 0;
    while (level > stopat) {
        pte = ptep.read();
        if (pte.empty()) {
            if (should_allocate_intermediate()) {
                allocate_intermediate_level(ptep);
                pte = ptep.read();
            } else {
                return;
            }
        } else if (pte.large()) {
            // We're trying to change a small page out of a huge page (or
            // in the future, potentially also 2 MB page out of a 1 GB),
            // so we need to first split the large page into smaller pages.
            // Our implementation ensures that it is ok to free pieces of an
            // alloc_huge_page() with free_page(), so it is safe to do such a
            // split.
            split_large_page(ptep, level);
            pte = ptep.read();
        }
        --level;
        pt = follow(pte);
        ptep = pt.at(pt_index(addr, level));
    }
    if (huge) {
        huge_page(ptep, offset);
    } else {
        small_page(ptep, offset);
    }
}
    
/*
 * populate() populates the page table with the entries it is (assumed to be)
 * missing to span the given virtual-memory address range, and then pre-fills
 * (using the given fill function) these pages and sets their permissions to
 * the given ones. This is part of the mmap implementation.
 */
    class populate : public page_range_operation {
    private:
        fill_page *fill;
        unsigned int perm;
    public:
        populate(fill_page *fill, unsigned int perm) : fill(fill), perm(perm) { }
    protected:
    
        virtual void small_page(hw_ptep ptep, uintptr_t offset){
    
            if (!ptep.read().empty()) {
                return;
            }
    
            phys page = virt_to_phys(memory::alloc_page());
    
            fill->fill(phys_to_virt(page), offset, page_size);
    
            if (!ptep.compare_exchange(make_empty_pte(), make_normal_pte(page, perm))) {
                memory::free_page(phys_to_virt(page));
        }
    }

        virtual void huge_page(hw_ptep ptep, uintptr_t offset){
    
            auto pte = ptep.read();
            if (!pte.empty()) {
                if (pte.large()) {
                    return;
                }
    
                // held smallpages (already evacuated), now will be used for huge page
                free_intermediate_level(ptep);
            }
    
            void *vpage = memory::alloc_huge_page(huge_page_size);
            if (!vpage) {
    
                phys pt_page = allocate_intermediate_level();
                if (!ptep.compare_exchange(make_empty_pte(), make_normal_pte(pt_page))) {
                    memory::free_page(phys_to_virt(pt_page));
    
                    return;
                }
    
                // If the current huge page operation failed, we can try to execute
                // it again filling the range with the equivalent number of small
                // pages.  We will do it for this page, but the next huge-page
                // aligned address may succeed (if there are frees between now and
                // then, for example).
                hw_ptep pt = follow(ptep.read());
                for (int i=0; i<pte_per_page; ++i) {
                    small_page(pt.at(i), offset);
                }
                return;
            }
    
            phys page = virt_to_phys(vpage);
            fill->fill(vpage, offset, huge_page_size);
    
            if (!ptep.compare_exchange(make_empty_pte(), make_large_pte(page, perm))) {
                memory::free_huge_page(phys_to_virt(page), huge_page_size);
        }
    }

        virtual bool should_allocate_intermediate(){
        return true;
    }
    };
    
    /*
     * Undo the operation of populate(), freeing memory allocated by populate()
     * and marking the pages non-present.
     */
    class unpopulate : public page_range_operation {
    protected:
    
        virtual void small_page(hw_ptep ptep, uintptr_t offset){
    
            // Note: we free the page even if it is already marked "not present".
            // evacuate() makes sure we are only called for allocated pages, and
            // not-present may only mean mprotect(PROT_NONE).
    
            pt_element pte = ptep.read();
    
            if (pte.empty()) {
                return;
            }
    
            ptep.write(make_empty_pte());
    
            // FIXME: tlb flush
        memory::free_page(phys_to_virt(pte.addr(false)));
    }

        virtual void huge_page(hw_ptep ptep, uintptr_t offset){
    
            pt_element pte = ptep.read();
            ptep.write(make_empty_pte());
            // FIXME: tlb flush
    
            if (pte.empty()) {
                return;
            }
    
            if (pte.large()) {
    
            memory::free_huge_page(phys_to_virt(pte.addr(true)),
                    huge_page_size);
        } else {
            // We've previously allocated small pages here, not a huge page.
            // We need to free them one by one - as they are not necessarily
            // part of one huge page.
    
                hw_ptep pt = follow(pte);
    
                for(int i=0; i<pte_per_page; ++i) {
    
                    if (pt.at(i).read().empty()) {
                        continue;
                    }
    
                    pt_element pte = pt.at(i).read();
                    // FIXME: tlb flush?
                    pt.at(i).write(make_empty_pte());
                memory::free_page(phys_to_virt(pte.addr(false)));
            }
            memory::free_page(pt.release());
    
            }
        }
        virtual bool should_allocate_intermediate(){
            return false;
        }
    };
    
    class protection : public page_range_operation {
    private:
        unsigned int perm;
    public:
    
        protection(unsigned int perm) : perm(perm) { }
    
        virtual void small_page(hw_ptep ptep, uintptr_t offset){
        if (ptep.read().empty()) {
            return;
            }
            change_perm(ptep, perm);
         }
    
        virtual void huge_page(hw_ptep ptep, uintptr_t offset){
        if (ptep.read().empty()) {
            return;
        } else if (ptep.read().large()) {
    
                change_perm(ptep, perm);
            } else {
    
                hw_ptep pt = follow(ptep.read());
    
                for (int i=0; i<pte_per_page; ++i) {
    
                    if (!pt.at(i).read().empty()) {
                        change_perm(pt.at(i), perm);
    
                    }
                }
            }
        }
        virtual bool should_allocate_intermediate(){
            return false;
        }
    };
    
    
    bool contains(uintptr_t start, uintptr_t end, vma& y)
    {
        return y.start() >= start && y.end() <= end;
    }
    
    /**
     * Change virtual memory range protection
     *
 * Change protection for a virtual memory range.  Updates page tables and VMAs
     * for populated memory regions and just VMAs for unpopulated ranges.
     *
     */
    
void protect(void *addr, size_t size, unsigned int perm)
{
    uintptr_t start = reinterpret_cast<uintptr_t>(addr);
        uintptr_t end = start + size;
        addr_range r(start, end);
    
        std::lock_guard<mutex> guard(vma_list_mutex);
    
        auto range = vma_list.equal_range(r, vma::addr_compare());
        for (auto i = range.first; i != range.second; ++i) {
            if (i->perm() == perm)
                continue;
            i->split(end);
            i->split(start);
            if (contains(start, end, *i)) {
                i->protect(perm);
            }
        }
    
        protection p(perm);
    p.operate(addr, size);
}

    uintptr_t find_hole(uintptr_t start, uintptr_t size)
    {
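    // Find room for a new mapping of the given size: return start itself if
    // [start, start+size) falls in a gap between existing vmas, otherwise the
    // beginning of the first gap at or after start that is large enough.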
        // FIXME: use lower_bound or something
        auto p = vma_list.begin();
        auto n = std::next(p);
        while (n != vma_list.end()) {
            if (start >= p->end() && start + size <= n->start()) {
            return start;
        }
        if (p->end() >= start && n->start() - p->end() >= size) {
                return p->end();
            }
            p = n;
        ++n;
    }
    abort();
    }
    
    
void evacuate(uintptr_t start, uintptr_t end)
{
    
        addr_range r(start, end);
    
        std::lock_guard<mutex> guard(vma_list_mutex);
    
        auto range = vma_list.equal_range(r, vma::addr_compare());
        for (auto i = range.first; i != range.second; ++i) {
    
            i->split(end);
            i->split(start);
            if (contains(start, end, *i)) {
    
                auto& dead = *i--;
    
                unpopulate().operate(dead);
    
                vma_list.erase(dead);
    
                delete &dead;
    
            }
        }
    
        // FIXME: range also indicates where we can insert a new anon_vma, use it
    
    }
    
    
    
    void unmap(void* addr, size_t size)
    {
    
        size = align_up(size, mmu::page_size);
    
        auto start = reinterpret_cast<uintptr_t>(addr);
    
    evacuate(start, start+size);
}

    error msync(void* addr, size_t length, int flags)
    {
        length = align_up(length, mmu::page_size);
        auto start = reinterpret_cast<uintptr_t>(addr);
        auto end = start+length;
        auto err = make_error(ENOMEM);
    
        addr_range r(start, end);
    
        WITH_LOCK(vma_list_mutex) {
    
            auto range = vma_list.equal_range(r, vma::addr_compare());
            for (auto i = range.first; i != range.second; ++i) {
                err = i->sync(start, end);
                if (err.bad()) {
                break;
            }
        }
    }
    return err;
}

    struct fill_anon_page : fill_page {
    
        virtual void fill(void* addr, uint64_t offset, uintptr_t size) {
        memset(addr, 0, size);
    }
};

    struct fill_anon_page_noinit: fill_page {
        virtual void fill(void* addr, uint64_t offset, uintptr_t size) {
        }
    };
    
    
uintptr_t allocate(vma *v, uintptr_t start, size_t size, bool search)
{
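    // search == true: find a free hole at or after start (or after a default
    // base when start is 0) and place the vma there.  search == false (fixed
    // mappings): evacuate whatever currently occupies [start, start+size).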
    
        if (search) {
            // search for unallocated hole around start
            if (!start) {
                start = 0x200000000000ul;
            }
            start = find_hole(start, size);
    
            v->set(start, start+size);
    
        } else {
            // we don't know if the given range is free, need to evacuate it first
            evacuate(start, start+size);
        }
    
        vma_list.insert(*v);
    
        return start;
    
    }
    
    
    
    void vpopulate(void* addr, size_t size)
    {
        fill_anon_page fill;
    
        WITH_LOCK(vma_list_mutex) {
    
        populate(&fill, perm_rwx).operate(addr, size);
    }
}
    
    void vdepopulate(void* addr, size_t size)
    {
    
        WITH_LOCK(vma_list_mutex) {
    
        unpopulate().operate(addr, size);
    }
}

    void* map_anon(void* addr, size_t size, unsigned flags, unsigned perm)
    
    {
    
        bool search = !(flags & mmap_fixed);
    
        size = align_up(size, mmu::page_size);
    
        auto start = reinterpret_cast<uintptr_t>(addr);
    
        auto* vma = new mmu::anon_vma(addr_range(start, start + size), perm, flags);
    
        std::lock_guard<mutex> guard(vma_list_mutex);
    
        auto v = (void*) allocate(vma, start, size, search);
        if (flags & mmap_populate) {
    
            if (flags & mmap_uninitialized) {
                fill_anon_page_noinit zfill;
                populate(&zfill, perm).operate(v, size);
            } else {
                fill_anon_page zfill;
                populate(&zfill, perm).operate(v, size);
        }
    }
    return v;
}
    
    void* map_file(void* addr, size_t size, unsigned flags, unsigned perm,
                  fileref f, f_offset offset)
    
    {
    
        bool search = !(flags & mmu::mmap_fixed);
        bool shared = flags & mmu::mmap_shared;
    
        auto asize = align_up(size, mmu::page_size);
    
        auto start = reinterpret_cast<uintptr_t>(addr);
    
        fill_anon_page zfill;
    
        auto *vma = new mmu::file_vma(addr_range(start, start + size), perm, f, offset, shared);
    
        void *v;
        WITH_LOCK(vma_list_mutex) {
            v = (void*) allocate(vma, start, asize, search);
            populate(&zfill, perm | perm_write).operate(v, asize);
        }
    
        auto fsize = ::size(f);
        // FIXME: we pre-zeroed this, and now we're overwriting the zeroes
        if (offset < fsize) {
            read(f, v, offset, std::min(size, fsize - offset));
        }
        // FIXME: do this more cleverly, avoiding a second pass
        if (!(perm & perm_write)) {
            protect(v, asize, perm);
        }
        return v;
    
    }
    
    
    
    bool is_linear_mapped(void *addr, size_t size)
    {
        if ((addr >= elf_start) && (addr + size <= elf_start + elf_size)) {
            return true;
        }
        return addr >= phys_mem;
    }
    
    
    // Checks if the entire given memory region is mmap()ed (in vma_list).
    
    bool ismapped(void *addr, size_t size)
    {
        uintptr_t start = (uintptr_t) addr;
        uintptr_t end = start + size;
    
        addr_range r(start, end);
    
    
        std::lock_guard<mutex> guard(vma_list_mutex);
    
    
        auto range = vma_list.equal_range(r, vma::addr_compare());
        for (auto p = range.first; p != range.second; ++p) {
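        // The vmas returned by equal_range are sorted by address; check that
        // together they cover [start, end) without leaving a gap.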
    
            if (p->start() > start)
                return false;
            start = p->end();
            if (start >= end)
                return true;
        }
        return false;
    }
    
    
    // Checks if the entire given memory region is readable.
    bool isreadable(void *addr, size_t size)
    {
        char *end = align_up((char *)addr + size, mmu::page_size);
        char tmp;
        for (char *p = (char *)addr; p < end; p += mmu::page_size) {
            if (!safe_load(p, tmp))
                return false;
        }
        return true;
    }
    
    namespace {
    
    uintptr_t align_down(uintptr_t ptr)
    {
        return ptr & ~(page_size - 1);
    }
    
    uintptr_t align_up(uintptr_t ptr)
    {
        return align_down(ptr + page_size - 1);
    }
    
    }
    
    bool access_fault(vma& vma, unsigned long error_code)
    {
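    // Return true if the faulting access is not allowed by the vma's
    // permissions: an instruction fetch is always treated as an access
    // violation here, a write requires perm_write, and a read requires
    // perm_read.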
        auto perm = vma.perm();
        if (error_code & page_fault_insn) {
            return true;
        }
        if (error_code & page_fault_write) {
            return !(perm & perm_write);
        }
        return !(perm & perm_read);
    }
    
    TRACEPOINT(trace_mmu_vm_fault, "addr=%p, error_code=%x", uintptr_t, u16);
    TRACEPOINT(trace_mmu_vm_fault_sigsegv, "addr=%p, error_code=%x", uintptr_t, u16);
    TRACEPOINT(trace_mmu_vm_fault_ret, "addr=%p, error_code=%x", uintptr_t, u16);
    
    void vm_sigsegv(uintptr_t addr, exception_frame* ef)
    {
        auto pc = reinterpret_cast<void*>(ef->rip);
        if (pc >= text_start && pc < text_end) {
            abort("page fault outside application");
        }
        osv::handle_segmentation_fault(addr, ef);
    }
    
    void vm_fault(uintptr_t addr, exception_frame* ef)
    {
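    // Look up the vma covering the faulting address: if there is none, or the
    // access violates its permissions, deliver SIGSEGV; otherwise ask the vma
    // to fault in the missing page.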
        trace_mmu_vm_fault(addr, ef->error_code);
        addr = align_down(addr);
        WITH_LOCK(vma_list_mutex) {
            auto vma = vma_list.find(addr_range(addr, addr+1), vma::addr_compare());
            if (vma == vma_list.end() || access_fault(*vma, ef->error_code)) {
                vm_sigsegv(addr, ef);
                trace_mmu_vm_fault_sigsegv(addr, ef->error_code);
                return;
        }
        vma->fault(addr, ef);
    }
        trace_mmu_vm_fault_ret(addr, ef->error_code);
    }
    
    
vma::vma(addr_range range, unsigned perm, unsigned flags)
    : _range(align_down(range.start()), align_up(range.end()))
    , _perm(perm)
    , _flags(flags)
{
}
    
    
    
    vma::~vma()
    {
    }
    
    
    void vma::set(uintptr_t start, uintptr_t end)
    {
    
    _range = addr_range(align_down(start), align_up(end));
}

    void vma::protect(unsigned perm)
    {
        _perm = perm;
    }
    
    
uintptr_t vma::start() const
{
    return _range.start();
}

uintptr_t vma::end() const
{
    return _range.end();
}

void* vma::addr() const
{
    return reinterpret_cast<void*>(_range.start());
}

uintptr_t vma::size() const
{
    return _range.end() - _range.start();
}

unsigned vma::perm() const
{
    return _perm;
}
    
    
    anon_vma::anon_vma(addr_range range, unsigned perm, unsigned flags)
        : vma(range, perm, flags)
    
    {
    }
    
    void anon_vma::split(uintptr_t edge)
    
    {
    
        if (edge <= _range.start() || edge >= _range.end()) {
    
        return;
    }
        vma* n = new anon_vma(addr_range(edge, _range.end()), _perm, _flags);
        _range = addr_range(_range.start(), edge);
    
        vma_list.insert(*n);
    }
    
error anon_vma::sync(uintptr_t start, uintptr_t end)
{
    return no_error();
}

void anon_vma::fault(uintptr_t addr, exception_frame *ef)
{
    // If the fault falls in the huge-page-aligned interior of this vma,
    // populate a whole huge page at once; otherwise populate a single
    // small page.
    auto hp_start = ::align_up(_range.start(), huge_page_size);
        auto hp_end = ::align_down(_range.end(), huge_page_size);
    
        size_t size;
        if (hp_start <= addr && addr < hp_end) {
            addr = ::align_down(addr, huge_page_size);
            size = huge_page_size;
        } else {
            size = page_size;
        }
    
    
        if (_flags & mmap_uninitialized) {
            fill_anon_page_noinit zfill;
            populate(&zfill, _perm).operate((void*)addr, size);
        } else {
            fill_anon_page zfill;
            populate(&zfill, _perm).operate((void*)addr, size);
    }
}

    file_vma::file_vma(addr_range range, unsigned perm, fileref file, f_offset offset, bool shared)
        : vma(range, perm, 0)
    
        , _file(file)
        , _offset(offset)
        , _shared(shared)
    {
    }
    
    
    void file_vma::split(uintptr_t edge)
    {
    
        if (edge <= _range.start() || edge >= _range.end()) {
    
            return;
        }
        auto off = offset(edge);
    
        vma* n = new file_vma(addr_range(edge, _range.end()), _perm, _file, off, _shared);
        _range = addr_range(_range.start(), edge);
    
        vma_list.insert(*n);
    }
    
    
    error file_vma::sync(uintptr_t start, uintptr_t end)
    {
        if (!_shared)
            return make_error(ENOMEM);
    
        start = std::max(start, _range.start());
        end = std::min(end, _range.end());
    
        auto fsize = ::size(_file);
        uintptr_t size = end - start;
    
        auto off = offset(start);
        write(_file, addr(), off, std::min(size, fsize - off));
    
        auto err = sys_fsync(_file.get());
        return make_error(err);
    }
    
    
    void file_vma::fault(uintptr_t addr, exception_frame *ef)
    
    {
        abort("trying to fault-in file-backed vma");
    }
    
    
    f_offset file_vma::offset(uintptr_t addr)
    {
    
    return _offset + (addr - _range.start());
}

    unsigned nr_page_sizes = 2; // FIXME: detect 1GB pages
    
    void set_nr_page_sizes(unsigned nr)
    {
        nr_page_sizes = nr;
    }
    
    
    pt_element page_table_root;
    
    void clamp(uintptr_t& vstart1, uintptr_t& vend1,
               uintptr_t min, size_t max, size_t slop)
    {
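    // Widen [vstart1, vend1] outward to slop-sized alignment, then clip it to
    // the [min, max] range.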
        vstart1 &= ~(slop - 1);
        vend1 |= (slop - 1);
        vstart1 = std::max(vstart1, min);
        vend1 = std::min(vend1, max);
    }
    
    unsigned pt_index(uintptr_t virt, unsigned level)
    {
        return pt_index(reinterpret_cast<void*>(virt), level);
    }
    
// Recursively build the boot-time linear mapping of virtual address v to
// physical address v + delta for [vstart, vend], using the largest page size
// that the alignment (slop) and supported page sizes allow at each level.
void linear_map_level(hw_ptep parent, uintptr_t vstart, uintptr_t vend,
        phys delta, uintptr_t base_virt, size_t slop, unsigned level)
    {
        --level;
    
        if (!parent.read().present()) {
            allocate_intermediate_level(parent);
    
        }
    
        hw_ptep pt = follow(parent.read());
    
        phys step = phys(1) << (page_size_shift + level * pte_per_page_shift);
    
        auto idx = pt_index(vstart, level);
        auto eidx = pt_index(vend, level);
        base_virt += idx * step;
        base_virt = (s64(base_virt) << 16) >> 16; // extend 47th bit
        while (idx <= eidx) {
            uintptr_t vstart1 = vstart, vend1 = vend;
            clamp(vstart1, vend1, base_virt, base_virt + step - 1, slop);
            if (level < nr_page_sizes && vstart1 == base_virt && vend1 == base_virt + step - 1) {
    
                pt.at(idx).write(make_pte(vstart1 + delta, level > 0));
    
            } else {
    
            linear_map_level(pt.at(idx), vstart1, vend1, delta, base_virt, slop, level);
        }
        base_virt += step;
        ++idx;
    }
    }
    
    size_t page_size_level(unsigned level)
    {
    
        return size_t(1) << (page_size_shift + pte_per_page_shift * level);
    
    }
    
    void linear_map(void* _virt, phys addr, size_t size, size_t slop)
    
    {
    
        uintptr_t virt = reinterpret_cast<uintptr_t>(_virt);
    
        slop = std::min(slop, page_size_level(nr_page_sizes - 1));
        assert((virt & (slop - 1)) == (addr & (slop - 1)));
    
    linear_map_level(hw_ptep::force(&page_table_root), virt, virt + size - 1,
            addr - virt, 0, slop, 4);
    }
    
    void free_initial_memory_range(uintptr_t addr, size_t size)
    {
        memory::free_initial_memory_range(phys_cast<void>(addr), size);
    }
    
    void switch_to_runtime_page_table()
    {
    
        processor::write_cr3(page_table_root.next_pt_addr());
    
    }
    
    }