From 3a24e7d5bf224891feab736c45c6c3549246b75b Mon Sep 17 00:00:00 2001
From: Avi Kivity <avi@cloudius-systems.com>
Date: Sun, 20 Jan 2013 13:30:41 +0200
Subject: [PATCH] mmu: move physical memory map to ffff800000000000

This allows us to clear the area around the null pointer, and so trap
null pointer dereferences.

The switch is tricky with respect to setting up memory: we need memory
to be mapped in order to free it, but we need free memory in order to
map it (for the page tables).

What we do is set up a temporary 1:1 map at ffff800000000000 that
mirrors the first 1G map we already have from bootstrap, use that to
free all of memory under 1G, then map and free everything else.
---
 arch/x64/arch-setup.cc | 110 ++++++++++++++++++++++++++++++++---------
 arch/x64/loader.ld     |   1 +
 mmu.cc                 |  66 ++++++++++++++++++++++++-
 mmu.hh                 |   2 +
 4 files changed, 153 insertions(+), 26 deletions(-)

diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc
index a2fdb49a5..30662df33 100644
--- a/arch/x64/arch-setup.cc
+++ b/arch/x64/arch-setup.cc
@@ -1,6 +1,7 @@
 #include "arch-setup.hh"
 #include "mempool.hh"
 #include "mmu.hh"
+#include "processor.hh"
 #include "types.hh"
 #include <alloca.h>
 #include <string.h>
@@ -38,34 +39,95 @@ struct e820ent {
 
 multiboot_info_type* multiboot_info;
 
-void arch_setup_free_memory()
+void setup_temporary_phys_map()
 {
-    // copy to stack so we don't free it now
-    auto mb = *multiboot_info;
-    auto tmp = alloca(mb.mmap_length);
-    memcpy(tmp, reinterpret_cast<void*>(mb.mmap_addr), mb.mmap_length);
-    auto p = tmp;
-    ulong edata;
-    asm ("lea .edata, %0" : "=rm"(edata));
-    while (p < tmp + mb.mmap_length) {
+    // duplicate 1:1 mapping into phys_mem
+    u64 cr3 = processor::read_cr3();
+    auto pt = reinterpret_cast<u64*>(cr3);
+    // assumes phys_mem = 0xffff800000000000
+    pt[256] = pt[0];
+}
+
+void for_each_e820_entry(void* e820_buffer, unsigned size, void (*f)(e820ent e))
+{
+    auto p = e820_buffer;
+    while (p < e820_buffer + size) {
         auto ent = static_cast<e820ent*>(p);
         if (ent->type == 1) {
-            memory::phys_mem_size += ent->size;
-            if (ent->addr < edata) {
-                u64 adjust = std::min(edata - ent->addr, ent->size);
-                ent->addr += adjust;
-                ent->size -= adjust;
-            }
-            // FIXME: limit to mapped 1GB for now
-            // later map all of memory and free it too
-            u64 memtop = 1 << 30;
-            if (ent->addr + ent->size >= memtop) {
-                auto excess = ent->addr + ent->size - memtop;
-                excess = std::min(ent->size, excess);
-                ent->size -= excess;
-            }
-            mmu::free_initial_memory_range(ent->addr, ent->size);
+            f(*ent);
         }
         p += ent->ent_size + 4;
     }
 }
+
+bool intersects(const e820ent& ent, u64 a)
+{
+    return a > ent.addr && a < ent.addr + ent.size;
+}
+
+e820ent truncate_below(e820ent ent, u64 a)
+{
+    u64 delta = a - ent.addr;
+    ent.addr += delta;
+    ent.size -= delta;
+    return ent;
+}
+
+e820ent truncate_above(e820ent ent, u64 a)
+{
+    u64 delta = ent.addr + ent.size - a;
+    ent.size -= delta;
+    return ent;
+}
+
+void arch_setup_free_memory()
+{
+    static constexpr u64 phys_mem = 0xffff800000000000;
+    static ulong edata;
+    asm ("movl $.edata, %0" : "=rm"(edata));
+    // copy to stack so we don't free it now
+    auto mb = *multiboot_info;
+    auto e820_buffer = alloca(mb.mmap_length);
+    auto e820_size = mb.mmap_length;
+    memcpy(e820_buffer, reinterpret_cast<void*>(mb.mmap_addr), e820_size);
+    for_each_e820_entry(e820_buffer, e820_size, [] (e820ent ent) {
+        memory::phys_mem_size += ent.size;
+    });
+    constexpr u64 initial_map = 1 << 30; // 1GB mapped by startup code
+    setup_temporary_phys_map();
+
+    // setup all memory up to 1GB.  We can't free any more, because no
+    // page tables have been set up, so we can't reference the memory being
+    // freed.
+    for_each_e820_entry(e820_buffer, e820_size, [] (e820ent ent) {
+        // can't free anything below edata, it's core code.
+        // FIXME: can free below 2MB.
+        if (ent.addr + ent.size <= edata) {
+            return;
+        }
+        if (intersects(ent, edata)) {
+            ent = truncate_below(ent, edata);
+        }
+        // ignore anything above 1GB, we haven't mapped it yet
+        if (intersects(ent, initial_map)) {
+            ent = truncate_above(ent, initial_map);
+        }
+        mmu::free_initial_memory_range(ent.addr, ent.size);
+    });
+    mmu::linear_map(phys_mem, 0, initial_map, initial_map);
+    // map the core
+    mmu::linear_map(0, 0, edata, 0x200000);
+    // now that we have some free memory, we can start mapping the rest
+    mmu::switch_to_runtime_page_table();
+    for_each_e820_entry(e820_buffer, e820_size, [] (e820ent ent) {
+        // Ignore memory already freed above
+        if (ent.addr + ent.size <= initial_map) {
+            return;
+        }
+        if (intersects(ent, initial_map)) {
+            ent = truncate_below(ent, edata);
+        }
+        mmu::linear_map(phys_mem + ent.addr, ent.addr, ent.size, ~0);
+        mmu::free_initial_memory_range(ent.addr, ent.size);
+    });
+}
diff --git a/arch/x64/loader.ld b/arch/x64/loader.ld
index 0475a9f63..92c84f365 100644
--- a/arch/x64/loader.ld
+++ b/arch/x64/loader.ld
@@ -40,6 +40,7 @@ SECTIONS
     .debug_weaknames 0 : { *(.debug_weaknames) }
     .gdb_index 0 : { *(.gdb_index) }
     .comment : { *(.comment) }
+    phys_mem = 0xffff800000000000;
 }
 PHDRS {
     text PT_LOAD FILEHDR PHDRS;
diff --git a/mmu.cc b/mmu.cc
index 93a15bd95..b033eb913 100644
--- a/mmu.cc
+++ b/mmu.cc
@@ -48,12 +48,12 @@ namespace mmu {
     template <typename T>
     T* phys_cast(phys pa)
     {
-        return reinterpret_cast<T*>(pa);
+        return reinterpret_cast<T*>(pa + 0xffff800000000000ull);
     }
 
     phys virt_to_phys(void *virt)
     {
-        return reinterpret_cast<phys>(virt);
+        return reinterpret_cast<phys>(virt) - 0xffff800000000000ull;
     }
 
     unsigned pt_index(void *virt, unsigned level)
@@ -287,10 +287,72 @@ namespace mmu {
         vma_list.insert(*n);
     }
 
+    unsigned nr_page_sizes = 2; // FIXME: detect 1GB pages
+
+    pt_element page_table_root;
+
+    void clamp(uintptr_t& vstart1, uintptr_t& vend1,
+               uintptr_t min, size_t max, size_t slop)
+    {
+        vstart1 &= ~(slop - 1);
+        vend1 |= (slop - 1);
+        vstart1 = std::max(vstart1, min);
+        vend1 = std::min(vend1, max);
+    }
+
+    unsigned pt_index(uintptr_t virt, unsigned level)
+    {
+        return pt_index(reinterpret_cast<void*>(virt), level);
+    }
+
+    void linear_map_level(pt_element& parent, uintptr_t vstart, uintptr_t vend,
+                          phys delta, uintptr_t base_virt, size_t slop, unsigned level)
+    {
+        --level;
+        if (!(parent & 1)) {
+            allocate_intermediate_level(&parent);
+        }
+        pt_element* pt = phys_cast<pt_element>(pte_phys(parent));
+        pt_element step = pt_element(1) << (12 + level * 9);
+        auto idx = pt_index(vstart, level);
+        auto eidx = pt_index(vend, level);
+        base_virt += idx * step;
+        base_virt = (s64(base_virt) << 16) >> 16; // extend 47th bit
+        while (idx <= eidx) {
+            uintptr_t vstart1 = vstart, vend1 = vend;
+            clamp(vstart1, vend1, base_virt, base_virt + step - 1, slop);
+            if (level < nr_page_sizes && vstart1 == base_virt && vend1 == base_virt + step - 1) {
+                pt[idx] = (vstart1 + delta) | 0x67 | (level == 0 ? 0 : 0x80);
+            } else {
+                linear_map_level(pt[idx], vstart1, vend1, delta, base_virt, slop, level);
+            }
+            base_virt += step;
+            ++idx;
+        }
+    }
+
+    size_t page_size_level(unsigned level)
+    {
+        return size_t(1) << (12 + 9 * level);
+    }
+
+    void linear_map(uintptr_t virt, phys addr, size_t size, size_t slop)
+    {
+        slop = std::min(slop, page_size_level(nr_page_sizes - 1));
+        assert((virt & (slop - 1)) == (addr & (slop - 1)));
+        linear_map_level(page_table_root, virt, virt + size - 1,
+                         addr - virt, 0, slop, 4);
+    }
+
     void free_initial_memory_range(uintptr_t addr, size_t size)
     {
         memory::free_initial_memory_range(phys_cast<void>(addr), size);
     }
+
+    void switch_to_runtime_page_table()
+    {
+        processor::write_cr3(pte_phys(page_table_root));
+    }
 }
 
 void page_fault(exception_frame *ef)
diff --git a/mmu.hh b/mmu.hh
index b3017ff7e..e4b766c23 100644
--- a/mmu.hh
+++ b/mmu.hh
@@ -42,7 +42,9 @@ namespace mmu {
    typedef uint64_t phys;
    phys virt_to_phys(void *virt);
 
+    void linear_map(uintptr_t virt, phys addr, size_t size, size_t slop);
     void free_initial_memory_range(uintptr_t addr, size_t size);
+    void switch_to_runtime_page_table();
 }
 
 #endif
-- 
GitLab
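
A note on the setup_temporary_phys_map() trick in the patch: copying PML4 slot 0
into slot 256 works because bits 39-47 of a virtual address select the PML4
entry, each entry spans 512GB, and 0xffff800000000000 happens to select entry
256. The standalone sketch below is not part of the patch; pml4_index() is a
made-up helper used only to check that arithmetic.

#include <cassert>
#include <cstdint>
#include <cstdio>

// Each PML4 entry covers 512 GiB (2^39 bytes); bits 39-47 of the
// virtual address select the entry.
unsigned pml4_index(uint64_t virt)
{
    return (virt >> 39) & 0x1ff;
}

int main()
{
    const uint64_t phys_mem = 0xffff800000000000ull;
    // The identity map built by the bootstrap code hangs off PML4 slot 0.
    assert(pml4_index(0) == 0);
    // phys_mem selects slot 256, so copying slot 0 into slot 256
    // (pt[256] = pt[0]) makes physical address P visible at phys_mem + P
    // without allocating any new page-table pages.
    assert(pml4_index(phys_mem) == 256);
    printf("pml4_index(phys_mem) = %u\n", pml4_index(phys_mem));
}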
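
Two pieces of arithmetic in linear_map_level() are also worth spelling out: the
(s64(x) << 16) >> 16 sign extension that keeps addresses canonical once the walk
crosses into PML4 slot 256, and the clamp() step that widens a range to the slop
granularity so a whole large page can be used. The sketch below is a standalone
illustration of those two expressions taken out of context; canonical() is a
hypothetical helper name, and the constants are example values only.

#include <algorithm>
#include <cassert>
#include <cstdint>

// Sign-extend bit 47, mirroring the (s64(base_virt) << 16) >> 16 expression
// in linear_map_level(); x86-64 virtual addresses must be canonical.
uint64_t canonical(uint64_t virt)
{
    return uint64_t(int64_t(virt << 16) >> 16);
}

// Widen [vstart, vend] outward to slop granularity, then clip it to the
// range covered by the current page-table entry, mirroring mmu::clamp().
void clamp(uint64_t& vstart, uint64_t& vend, uint64_t min, uint64_t max, uint64_t slop)
{
    vstart &= ~(slop - 1);
    vend |= (slop - 1);
    vstart = std::max(vstart, min);
    vend = std::min(vend, max);
}

int main()
{
    // PML4 slot 256 starts at 0x0000800000000000 before sign extension...
    assert(canonical(0x0000800000000000ull) == 0xffff800000000000ull);
    // ...while low addresses pass through unchanged.
    assert(canonical(0x0000000000200000ull) == 0x200000ull);

    // With 2 MiB slop, a sub-aligned request is widened to the enclosing
    // 2 MiB boundaries, which is what lets linear_map_level() satisfy it
    // with large pages instead of recursing down to 4 KiB mappings.
    uint64_t vstart = 0x201000, vend = 0x3fefff;
    clamp(vstart, vend, 0, ~0ull, 0x200000);
    assert(vstart == 0x200000 && vend == 0x3fffff);
}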