From 6f121c1556447dac8c3fc0985ec10ee74031be4c Mon Sep 17 00:00:00 2001
From: Nadav Har'El <nyh@cloudius-systems.com>
Date: Wed, 20 Mar 2013 19:26:48 +0200
Subject: [PATCH] Less repetitive mmap/unmap/mprotect code

Rewrote the mmap/unmap/mprotect code to be much less repetitive. There is
a new "page_range_operation" class, from which the classes "populate",
"unpopulate", and "protection" derive to implement mmap, munmap and
mprotect respectively. The code is now much shorter, less repetitive,
clearer (I hope), and also conforms better to the new coding conventions.

Note that linear_map is still separate, and page_range_operation keeps
its old algorithm (of starting at the root again for each page). Now
that we have this clean OO structure, it will be easier to change this
algorithm to be similar to linear_map's.
---
 core/mmu.cc | 452 ++++++++++++++++++++++++----------------------------
 1 file changed, 206 insertions(+), 246 deletions(-)

diff --git a/core/mmu.cc b/core/mmu.cc
index bac298826..17a8182dc 100644
--- a/core/mmu.cc
+++ b/core/mmu.cc
@@ -7,6 +7,7 @@
 #include <string.h>
 #include <iterator>
 #include "libc/signal.hh"
+#include "align.hh"
 
 namespace {
 
@@ -163,39 +164,89 @@ void debug_count_ptes(pt_element pte, int level, size_t &nsmall, size_t &nhuge)
     }
 }
 
-void populate_page(void* addr, fill_page& fill, uint64_t offset, unsigned perm)
+/*
+ * A page_range_operation implementation operates (via the operate() method)
+ * on a page-aligned byte range of virtual memory. The range is divided into a
+ * bulk of aligned huge pages (2MB pages), and if the beginning and end
+ * addresses aren't 2MB aligned, there are additional small pages (4KB pages).
+ * The appropriate method (small_page() or huge_page()) is called for
+ * each of these pages, to implement the operation.
+ * By supporting operations directly on whole huge pages, we allow for smaller
+ * page tables and better TLB efficiency.
+ *
+ * TODO: Instead of walking the page table from its root for each page (small
+ * or huge), we can more efficiently walk the page table once, calling
+ * small_page()/huge_page() for the relevant page table entries (as well as
+ * avoid repeating the necessary allocation and split code, now repeated in
+ * all our small_page()/huge_page() implementations). See linear_map for
+ * an example of how to do this walk.
+ */
+class page_range_operation {
+public:
+    void operate(void *start, size_t size);
+    void operate(vma &vma){ operate((void*)vma.start(), vma.size()); }
+protected:
+    // offset is the offset of this page in the entire address range
+    // (in case the operation needs to know this).
+    virtual void small_page(pt_element *ptep, uintptr_t offset) = 0;
+    virtual void huge_page(pt_element *ptep, uintptr_t offset) = 0;
+    virtual bool should_allocate_intermediate() = 0;
+private:
+    void operate_page(bool huge, void *addr, uintptr_t offset);
+};
+
+void page_range_operation::operate(void *start, size_t size)
 {
-    pt_element pte = processor::read_cr3();
-    auto pt = phys_cast<pt_element>(pte_phys(pte));
-    auto ptep = &pt[pt_index(addr, nlevels - 1)];
-    unsigned level = nlevels - 1;
-    while (level > 0) {
-        if (!pte_present(*ptep)) {
-            allocate_intermediate_level(ptep);
-        } else if (pte_large(*ptep)) {
-            split_large_page(ptep, level);
-        }
-        pte = *ptep;
-        --level;
-        pt = phys_cast<pt_element>(pte_phys(pte));
-        ptep = &pt[pt_index(addr, level)];
+    start = align_down(start, page_size);
+    size = align_up(size, page_size);
+    void *end = start + size; // one byte after the end
+
+    // Find the largest 2MB-aligned range inside the given byte (or actually,
+    // 4K-aligned) range:
+    auto hp_start = align_up(start, huge_page_size);
+    auto hp_end = align_down(end, huge_page_size);
+
+    // Fix the hp_start/hp_end in degenerate cases so the following
+    // loops do the right thing.
+    if (hp_start > end) {
+        hp_start = end;
+    }
+    if (hp_end < start) {
+        hp_end = end;
+    }
+
+    for (void *addr = start; addr < hp_start; addr += page_size) {
+        operate_page(false, addr, (uintptr_t)addr-(uintptr_t)start);
+    }
+    for (void *addr = hp_start; addr < hp_end; addr += huge_page_size) {
+        operate_page(true, addr, (uintptr_t)addr-(uintptr_t)start);
+    }
+    for (void *addr = hp_end; addr < end; addr += page_size) {
+        operate_page(false, addr, (uintptr_t)addr-(uintptr_t)start);
     }
-    phys page = alloc_page();
-    fill.fill(phys_to_virt(page), offset);
-    assert(pte_phys(*ptep)==0); // don't populate an already populated page!
-    *ptep = make_pte(page, perm);
 }
 
-void populate_huge_page(void* addr, fill_page& fill, uint64_t offset, unsigned perm)
+void page_range_operation::operate_page(bool huge, void *addr, uintptr_t offset)
 {
     pt_element pte = processor::read_cr3();
     auto pt = phys_cast<pt_element>(pte_phys(pte));
     auto ptep = &pt[pt_index(addr, nlevels - 1)];
     unsigned level = nlevels - 1;
-    while (level > 1) {
+    unsigned stopat = huge ? 1 : 0;
+    while (level > stopat) {
         if (!pte_present(*ptep)) {
-            allocate_intermediate_level(ptep);
+            if (should_allocate_intermediate()) {
+                allocate_intermediate_level(ptep);
+            } else {
+                return;
+            }
         } else if (pte_large(*ptep)) {
+            // We're trying to change a small page that is part of a huge
+            // page (or, in the future, potentially a 2MB page that is part
+            // of a 1GB page), so we need to first split the large page into
+            // smaller pages. Our implementation ensures that it is ok to
+            // free pieces of an alloc_huge_page() with free_page(), so it
+            // is safe to do such a split.
             split_large_page(ptep, level);
         }
         pte = *ptep;
@@ -203,18 +254,11 @@ void populate_huge_page(void* addr, fill_page& fill, uint64_t offset, unsigned p
         pt = phys_cast<pt_element>(pte_phys(pte));
         ptep = &pt[pt_index(addr, level)];
     }
-    phys page = virt_to_phys(memory::alloc_huge_page(huge_page_size));
-    uint64_t o=0;
-    for (int i=0; i<pte_per_page; i++){
-        fill.fill(phys_to_virt(page+o), offset+o);
-        o += page_size;
-    }
-    if (pte_phys(*ptep)) {
-        assert(!pte_large(*ptep)); // don't populate an already populated page!
-        // held smallpages (already evacuated), now will be used for huge page
-        free_intermediate_level(ptep);
+    if(huge) {
+        huge_page(ptep, offset);
+    } else {
+        small_page(ptep, offset);
     }
-    *ptep = make_pte(page, perm) | (1<<7);
 }
 
 /*
@@ -228,126 +272,146 @@ void populate_huge_page(void* addr, fill_page& fill, uint64_t offset, unsigned p
  * is not 2MB aligned, we will need to apply the fill and perm only to a part
  * of a large page, in which case we must break the entire large page into its
  * constitutive small (4K) pages.
- *
- * FIXME: It would be nicer to, instead of iterating on all levels per page as
- * we do in populate_page/populate_huge_page, we walk once on the whole
- * hiearchy, as in linear_map.
  */
-void populate(vma& vma, fill_page& fill, unsigned perm)
-{
-    // Find the largest 2MB-aligned range inside the given byte (or actually,
-    // 4K-aligned) range:
-    uintptr_t hp_start = ((vma.start()-1) & ~(huge_page_size-1)) + huge_page_size;
-    uintptr_t hp_end = (vma.end()) & ~(huge_page_size-1);
-
-    if (hp_start > vma.end())
-        hp_start = vma.end();
-    if (hp_end < vma.start())
-        hp_end = vma.end();
-
-    /* Step 1: Break up the partial huge page (if any) in the beginning of the
-     * address range, and populate the small pages.
-     * TODO: it would be more efficient not to walk all the levels all the time */
-    for (auto addr = vma.start(); addr < hp_start; addr += page_size)
-        populate_page(reinterpret_cast<void*>(addr), fill, addr-vma.start(), perm);
-    /* Step 2: Populate the huge pages (if any) in the middle of the range */
-    for (auto addr = hp_start; addr < hp_end; addr += huge_page_size)
-        populate_huge_page(reinterpret_cast<void*>(addr), fill, addr-vma.start(), perm);
-    /* Step 3: Break up the partial huge page (if any) at the end of the range */
-    for (auto addr = hp_end; addr < vma.end(); addr += page_size)
-        populate_page(reinterpret_cast<void*>(addr), fill, addr-vma.start(), perm);
-    //size_t nsmall=0, nhuge=0;
-    //debug_count_ptes(processor::read_cr3(), 4, nsmall, nhuge);
-    //debug(fmt("after population, page table contains %ld small pages, %ld huge") % nsmall % nhuge);
-
-}
-
-void unpopulate_page(void* addr)
-{
-    pt_element pte = processor::read_cr3();
-    auto pt = phys_cast<pt_element>(pte_phys(pte));
-    auto ptep = &pt[pt_index(addr, nlevels - 1)];
-    unsigned level = nlevels - 1;
-    while (level > 0) {
-        if (!pte_present(*ptep))
-            return;
-        else if (pte_large(*ptep)) {
-            // This case means that part of a larger mmap was mmapped over,
-            // previously a huge page was mapped, and now we need to free some
-            // of the small pages composing it. Luckily, in our implementation
-            // it is ok to free pieces of a alloc_huge_page() with free_page()
-            split_large_page(ptep, level);
+class populate : public page_range_operation {
+private:
+    fill_page *fill;
+    unsigned int perm;
+public:
+    populate(fill_page *fill, unsigned int perm) : fill(fill), perm(perm) { }
+protected:
+    virtual void small_page(pt_element *ptep, uintptr_t offset){
+        phys page = alloc_page();
+        fill->fill(phys_to_virt(page), offset);
+        assert(pte_phys(*ptep)==0); // don't populate an already populated page!
+        *ptep = make_pte(page, perm);
+    }
+    virtual void huge_page(pt_element *ptep, uintptr_t offset){
+        phys page = virt_to_phys(memory::alloc_huge_page(huge_page_size));
+        uint64_t o=0;
+        // Unfortunately, fill() is only coded for small-page-size chunks, so
+        // we need to repeat it:
+        for (int i=0; i<pte_per_page; i++){
+            fill->fill(phys_to_virt(page+o), offset+o);
+            o += page_size;
         }
-        pte = *ptep;
-        --level;
-        pt = phys_cast<pt_element>(pte_phys(pte));
-        ptep = &pt[pt_index(addr, level)];
+        if (pte_phys(*ptep)) {
+            assert(!pte_large(*ptep)); // don't populate an already populated page!
+            // held small pages (already evacuated), now will be used for huge page
+            free_intermediate_level(ptep);
+        }
+        *ptep = make_pte(page, perm) | (1<<7);
     }
-    // Note: we free the page even if it is already marked "not present".
-    // evacuate() makes sure we are only called for allocated pages, and
-    // not-present may only mean mprotect(PROT_NONE).
-    phys page=pte_phys(*ptep);
-    assert(page); // evacuate() shouldn't call us twice for the same page.
-    memory::free_page(phys_to_virt(page));
-    *ptep = 0;
-}
-
-void unpopulate_huge_page(void* addr)
-{
-    pt_element pte = processor::read_cr3();
-    auto pt = phys_cast<pt_element>(pte_phys(pte));
-    auto ptep = &pt[pt_index(addr, nlevels - 1)];
-    unsigned level = nlevels - 1;
-    while (level > 1) {
-        if (!pte_present(*ptep))
-            return;
-        else if (pte_large(*ptep))
-            split_large_page(ptep, level);
-        pte = *ptep;
-        --level;
-        pt = phys_cast<pt_element>(pte_phys(pte));
-        ptep = &pt[pt_index(addr, level)];
+    virtual bool should_allocate_intermediate(){
+        return true;
     }
-    if (!pte_present(*ptep)){
+};
+
+/*
+ * Undo the operation of populate(), freeing memory allocated by populate()
+ * and marking the pages non-present.
+ */
+class unpopulate : public page_range_operation {
+protected:
+    virtual void small_page(pt_element *ptep, uintptr_t offset){
         // Note: we free the page even if it is already marked "not present".
         // evacuate() makes sure we are only called for allocated pages, and
        // not-present may only mean mprotect(PROT_NONE).
         phys page=pte_phys(*ptep);
         assert(page); // evacuate() shouldn't call us twice for the same page.
-        memory::free_huge_page(phys_to_virt(page), huge_page_size);
-    } else if (pte_large(*ptep)){
-        memory::free_huge_page(phys_to_virt(pte_phys(*ptep)), huge_page_size);
-    } else {
-        // We've previously allocated small pages here, not a huge pages.
-        // We need to free them one by one - as they are not necessarily part
-        // of one huge page.
-        pt_element* pt = phys_cast<pt_element>(pte_phys(*ptep));
-        for(int i=0; i<pte_per_page; ++i)
-            if (pte_present(pt[i]))
-                memory::free_page(phys_to_virt(pte_phys(pt[i])));
+        memory::free_page(phys_to_virt(page));
+        *ptep = 0;
     }
-    *ptep = 0;
-}
+    virtual void huge_page(pt_element *ptep, uintptr_t offset){
+        if (!pte_present(*ptep)) {
+            // Note: we free the page even if it is already marked "not present".
+            // evacuate() makes sure we are only called for allocated pages, and
+            // not-present may only mean mprotect(PROT_NONE).
+            phys page=pte_phys(*ptep);
+            assert(page); // evacuate() shouldn't call us twice for the same page.
+            memory::free_huge_page(phys_to_virt(page), huge_page_size);
+        } else if (pte_large(*ptep)) {
+            memory::free_huge_page(phys_to_virt(pte_phys(*ptep)), huge_page_size);
+        } else {
+            // We've previously allocated small pages here, not huge pages.
+            // We need to free them one by one - as they are not necessarily part
+            // of one huge page.
+            pt_element* pt = phys_cast<pt_element>(pte_phys(*ptep));
+            for(int i=0; i<pte_per_page; ++i)
+                if (pte_present(pt[i]))
+                    memory::free_page(phys_to_virt(pte_phys(pt[i])));
+        }
+        *ptep = 0;
+    }
+    virtual bool should_allocate_intermediate(){
+        return false;
+    }
+};
 
-/*
- * Undo the operation of populate(), freeing memory allocated by populate()
- * and marking the pages non-present.
- */
-void unpopulate(vma& vma)
+void change_perm(pt_element *ptep, unsigned int perm)
 {
-    uintptr_t hp_start = ((vma.start()-1) & ~(huge_page_size-1)) + huge_page_size;
-    uintptr_t hp_end = (vma.end()) & ~(huge_page_size-1);
-    if (hp_start > vma.end())
-        hp_start = vma.end();
-    if (hp_end < vma.start())
-        hp_end = vma.end();
+    // Note: in x86, if the present bit (0x1) is off, not only read is
+    // disallowed, but also write and exec. So in mprotect, if any
+    // permission is requested, we must also grant read permission.
+    // Linux does this too.
+    if (perm)
+        *ptep |= 0x1;
+    else
+        *ptep &= ~0x1;
+
+    if (perm & perm_write)
+        *ptep |= 0x2;
+    else
+        *ptep &= ~0x2;
 
-    for (auto addr = vma.start(); addr < hp_start; addr += page_size)
-        unpopulate_page(reinterpret_cast<void*>(addr));
-    for (auto addr = hp_start; addr < hp_end; addr += huge_page_size)
-        unpopulate_huge_page(reinterpret_cast<void*>(addr));
-    for (auto addr = hp_end; addr < vma.end(); addr += page_size)
-        unpopulate_page(reinterpret_cast<void*>(addr));
+    if (!(perm & perm_exec))
+        *ptep |= pt_element(0x8000000000000000);
+    else
+        *ptep &= ~pt_element(0x8000000000000000);
+}
+
+class protection : public page_range_operation {
+private:
+    unsigned int perm;
+    bool success;
+public:
+    protection(unsigned int perm) : perm(perm), success(true) { }
+    bool getsuccess(){ return success; }
+protected:
+    virtual void small_page(pt_element *ptep, uintptr_t offset){
+        if (!pte_phys(*ptep)) {
+            success = false;
+            return;
+        }
+        change_perm(ptep, perm);
+    }
+    virtual void huge_page(pt_element *ptep, uintptr_t offset){
+        if (!pte_phys(*ptep)) {
+            success = false;
+        } else if (pte_large(*ptep)) {
+            change_perm(ptep, perm);
+        } else {
+            pt_element* pt = phys_cast<pt_element>(pte_phys(*ptep));
+            for (int i=0; i<pte_per_page; ++i) {
+                if (pte_phys(pt[i])) {
+                    change_perm(&pt[i], perm);
+                } else {
+                    success = false;
+                }
+            }
+        }
+    }
+    virtual bool should_allocate_intermediate(){
+        success = false;
+        return false;
+    }
+};
+
+int protect(void *addr, size_t size, unsigned int perm)
+{
+    protection p(perm);
+    p.operate(addr, size);
+    return p.getsuccess();
 }
 
 uintptr_t find_hole(uintptr_t start, uintptr_t size)
@@ -383,7 +447,7 @@ void evacuate(vma* v)
         i->split(v->start());
         if (contains(*v, *i)) {
             auto& dead = *i--;
-            unpopulate(dead);
+            unpopulate().operate(dead);
             vma_list.erase(dead);
         }
     }
@@ -440,7 +504,7 @@ vma* allocate(uintptr_t start, uintptr_t end, fill_page& fill,
     evacuate(ret);
     vma_list.insert(*ret);
 
-    populate(*ret, fill, perm);
+    populate(&fill, perm).operate((void*)start, end-start);
 
     return ret;
 }
@@ -461,110 +525,6 @@ vma* map_file(void* addr, size_t size, unsigned perm,
     return ret;
 }
 
-void change_perm(pt_element *ptep, unsigned int perm)
-{
-    // Note: in x86, if the present bit (0x1) is off, not only read is
-    // disallowed, but also write and exec. So in mprotect, if any
-    // permission is requested, we must also grant read permission.
-    // Linux does this too.
-    if (perm)
-        *ptep |= 0x1;
-    else
-        *ptep &= ~0x1;
-
-    if (perm & perm_write)
-        *ptep |= 0x2;
-    else
-        *ptep &= ~0x2;
-
-    if (!(perm & perm_exec))
-        *ptep |= pt_element(0x8000000000000000);
-    else
-        *ptep &= ~pt_element(0x8000000000000000);
-}
-
-int protect_page(void *addr, unsigned int perm)
-{
-    pt_element pte = processor::read_cr3();
-    auto pt = phys_cast<pt_element>(pte_phys(pte));
-    auto ptep = &pt[pt_index(addr, nlevels - 1)];
-    unsigned level = nlevels - 1;
-    while (level > 0) {
-        if (!pte_phys(*ptep))
-            return 0;
-        else if (pte_large(*ptep)) {
-            // We're trying to change the protection of part of a huge page, so
-            // we need to split the huge page into small pages. This is fine
-            // because in in our implementation it is ok to free pieces of a
-            // alloc_huge_page() with free_page()
-            split_large_page(ptep, level);
-        }
-        pte = *ptep;
-        --level;
-        pt = phys_cast<pt_element>(pte_phys(pte));
-        ptep = &pt[pt_index(addr, level)];
-    }
-    if (!pte_phys(*ptep))
-        return 0;
-    change_perm(ptep, perm);
-    return 1;
-}
-
-int protect_huge_page(void *addr, unsigned int perm)
-{
-    pt_element pte = processor::read_cr3();
-    auto pt = phys_cast<pt_element>(pte_phys(pte));
-    auto ptep = &pt[pt_index(addr, nlevels - 1)];
-    unsigned level = nlevels - 1;
-    while (level > 1) {
-        if (!pte_present(*ptep))
-            return 0;
-        else if (pte_large(*ptep))
-            split_large_page(ptep, level);
-        pte = *ptep;
-        --level;
-        pt = phys_cast<pt_element>(pte_phys(pte));
-        ptep = &pt[pt_index(addr, level)];
-    }
-    if (!pte_phys(*ptep))
-        return 0;
-
-    if (pte_large(*ptep)){
-        change_perm(ptep, perm);
-        return 1;
-    } else {
-        int ret = 1;
-        pt_element* pt = phys_cast<pt_element>(pte_phys(*ptep));
-        for(int i=0; i<pte_per_page; ++i)
-            if(pte_phys(pt[i]))
-                change_perm(&pt[i], perm);
-            else
-                ret = 0;
-        return ret;
-    }
-}
-
-int protect(void *start, size_t size, unsigned int perm)
-{
-    void *end = start+size; // one byte after the end
-    void *hp_start = (void*) ((((uintptr_t)start-1) & ~(huge_page_size-1)) +
-            huge_page_size);
-    void *hp_end = (void*) ((uintptr_t)end & ~(huge_page_size-1));
-    if (hp_start > end)
-        hp_start = end;
-    if (hp_end < start)
-        hp_end = end;
-
-    int ret=1;
-    for (auto addr = start; addr < hp_start; addr += page_size)
-        ret &= protect_page(addr, perm);
-    for (auto addr = hp_start; addr < hp_end; addr += huge_page_size)
-        ret &= protect_huge_page(addr, perm);
-    for (auto addr = hp_end; addr < end; addr += page_size)
-        ret &= protect_page(addr, perm);
-    return ret;
-}
-
 namespace {
-- 
GitLab
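
Illustrative sketch (not part of the patch): the core idea of page_range_operation -- split a byte range into a small-page head, a run of whole 2MB pages, and a small-page tail, then dispatch to virtual per-page hooks -- can be shown as a minimal, self-contained C++ program. The page sizes, the alignment helpers and the count_pages subclass below are assumptions made purely for the example; only the overall shape mirrors the class introduced by this patch.

// sketch.cc -- standalone illustration, not OSv code
#include <cstddef>
#include <cstdint>
#include <cstdio>

namespace sketch {

constexpr uintptr_t page_size      = 4096;
constexpr uintptr_t huge_page_size = 2 * 1024 * 1024;

inline uintptr_t align_down(uintptr_t v, uintptr_t a) { return v & ~(a - 1); }
inline uintptr_t align_up(uintptr_t v, uintptr_t a)   { return align_down(v + a - 1, a); }

// Template-method base class: operate() decomposes the range and calls the
// per-page hooks; concrete operations only implement the hooks.
class page_range_operation {
public:
    void operate(uintptr_t start, size_t size) {
        start = align_down(start, page_size);
        uintptr_t end = align_up(start + size, page_size);

        // Largest huge-page-aligned sub-range, with the same degenerate-case
        // fixups the patch applies so the three loops below stay valid.
        uintptr_t hp_start = align_up(start, huge_page_size);
        uintptr_t hp_end   = align_down(end, huge_page_size);
        if (hp_start > end)  hp_start = end;
        if (hp_end < start)  hp_end = end;

        for (uintptr_t a = start; a < hp_start; a += page_size)       small_page(a, a - start);
        for (uintptr_t a = hp_start; a < hp_end; a += huge_page_size) huge_page(a, a - start);
        for (uintptr_t a = hp_end; a < end; a += page_size)           small_page(a, a - start);
    }
protected:
    virtual void small_page(uintptr_t addr, uintptr_t offset) = 0;
    virtual void huge_page(uintptr_t addr, uintptr_t offset) = 0;
};

// A toy concrete operation: count how many pages of each kind the range
// decomposes into (a stand-in for populate/unpopulate/protection).
class count_pages : public page_range_operation {
public:
    size_t small = 0, huge = 0;
protected:
    void small_page(uintptr_t, uintptr_t) override { ++small; }
    void huge_page(uintptr_t, uintptr_t) override  { ++huge; }
};

} // namespace sketch

int main() {
    sketch::count_pages c;
    // 5 MB starting 8 KB past a 2 MB boundary: small-page head, one whole
    // huge page in the middle, then a small-page tail.
    c.operate(2 * 1024 * 1024 + 8 * 1024, 5 * 1024 * 1024);
    std::printf("small pages: %zu, huge pages: %zu\n", c.small, c.huge);
}

For that example range the program reports 768 small pages and 1 huge page, which is exactly the decomposition operate() would hand to small_page()/huge_page() in the patch.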
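A note on the magic number in change_perm(): 0x8000000000000000 is bit 63 of the page-table entry, the x86-64 NX (no-execute) bit, while 0x1 and 0x2 are the present and writable bits. The standalone sketch below restates that bit logic with named constants; the perm_* values and pt_element alias are illustrative assumptions for the example, not OSv's definitions.

// perm_sketch.cc -- standalone illustration, not OSv code
#include <cstdint>
#include <cstdio>

using pt_element = uint64_t;

constexpr unsigned perm_read  = 1;  // any nonzero perm implies readable
constexpr unsigned perm_write = 2;
constexpr unsigned perm_exec  = 4;

void change_perm(pt_element *ptep, unsigned perm)
{
    // On x86, clearing the present bit forbids read, write and exec alike,
    // so requesting any permission also sets the present (readable) bit.
    if (perm)
        *ptep |= 0x1;
    else
        *ptep &= ~pt_element(0x1);

    if (perm & perm_write)
        *ptep |= 0x2;
    else
        *ptep &= ~pt_element(0x2);

    // The NX bit is inverted: it is set to *deny* execution.
    if (!(perm & perm_exec))
        *ptep |= pt_element(1) << 63;
    else
        *ptep &= ~(pt_element(1) << 63);
}

int main() {
    pt_element pte = 0x1234000 | 0x1;          // a mapped, read-only entry
    change_perm(&pte, perm_read | perm_write); // writable, non-executable
    std::printf("pte = %#llx\n", (unsigned long long)pte);
}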