diff --git a/arch/x64/arch-cpu.cc b/arch/x64/arch-cpu.cc
index fb0e6cf04588e97417bef9966c5fb05dbd333920..1c82cea3e36d5fb765e4fdf30b4d055325b5be55 100644
--- a/arch/x64/arch-cpu.cc
+++ b/arch/x64/arch-cpu.cc
@@ -17,14 +17,13 @@ inline void arch_cpu::enter_exception()
         abort("nested exception");
     }
     in_exception = true;
-    auto& s = initstack.stack;
+    auto& s = percpu_exception_stack;
     set_exception_stack(s, sizeof(s));
 }
 
 inline void arch_cpu::exit_exception()
 {
-    auto& s = exception_stack;
-    set_exception_stack(s, sizeof(s));
+    set_exception_stack(&thread::current()->_arch);
     in_exception = false;
 }
 
diff --git a/arch/x64/arch-cpu.hh b/arch/x64/arch-cpu.hh
index 6c73c1dac98bd4be7deeb4f49cb36796d1903349..9c7531b057512b43d29af136752f7ae9eede2b0e 100644
--- a/arch/x64/arch-cpu.hh
+++ b/arch/x64/arch-cpu.hh
@@ -40,7 +40,8 @@ struct arch_cpu {
     arch_cpu();
     processor::aligned_task_state_segment atss;
     init_stack initstack;
-    char exception_stack[4096] __attribute__((aligned(16)));
+    // Per-CPU exception stack, used for nested exceptions and early-boot faults.
+    char percpu_exception_stack[4096] __attribute__((aligned(16)));
     u32 apic_id;
     u32 acpi_id;
     u64 gdt[nr_gdt];
@@ -48,6 +49,7 @@ struct arch_cpu {
     void init_on_cpu();
     void set_ist_entry(unsigned ist, char* base, size_t size);
     void set_exception_stack(char* base, size_t size);
+    void set_exception_stack(arch_thread* t);
     void set_interrupt_stack(arch_thread* t);
     void enter_exception();
     void exit_exception();
@@ -55,6 +57,7 @@ struct arch_cpu {
 
 struct arch_thread {
     char interrupt_stack[4096] __attribute__((aligned(16)));
+    char exception_stack[4096] __attribute__((aligned(16)));
 };
 
 
@@ -91,7 +94,8 @@ inline arch_cpu::arch_cpu()
     gdt[gdt_tss] |= (tss_addr & 0x00ffffff) << 16;
     gdt[gdt_tss] |= (tss_addr & 0xff000000) << 32;
     gdt[gdt_tssx] = tss_addr >> 32;
-    set_exception_stack(exception_stack, sizeof(exception_stack));
+    // Use the per-CPU stack for early boot faults.
+    set_exception_stack(percpu_exception_stack, sizeof(percpu_exception_stack));
 }
 
 inline void arch_cpu::set_ist_entry(unsigned ist, char* base, size_t size)
@@ -104,6 +108,12 @@ inline void arch_cpu::set_exception_stack(char* base, size_t size)
     set_ist_entry(1, base, size);
 }
 
+inline void arch_cpu::set_exception_stack(arch_thread* t)
+{
+    auto& s = t->exception_stack;
+    set_ist_entry(1, s, sizeof(s));
+}
+
 inline void arch_cpu::set_interrupt_stack(arch_thread* t)
 {
     auto& s = t->interrupt_stack;
diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
index 5341b14eb0f7b346cef6195e4bc5a92bb56e3596..dc47850b7758a1a34dcc938f3bd233a182816382 100644
--- a/arch/x64/arch-switch.hh
+++ b/arch/x64/arch-switch.hh
@@ -54,6 +54,7 @@ void thread::switch_to()
     set_fsbase(reinterpret_cast<u64>(_tcb));
     barrier();
     _cpu->arch.set_interrupt_stack(&_arch);
+    _cpu->arch.set_exception_stack(&_arch);
     asm volatile
         ("mov %%rbp, %c[rbp](%0) \n\t"
          "movq $1f, %c[rip](%0) \n\t"
@@ -80,6 +81,7 @@ void thread::switch_to_first()
     current_cpu = _cpu;
     remote_thread_local_var(percpu_base) = _cpu->percpu_base;
     _cpu->arch.set_interrupt_stack(&_arch);
+    _cpu->arch.set_exception_stack(&_arch);
     asm volatile
         ("mov %c[rsp](%0), %%rsp \n\t"
          "mov %c[rbp](%0), %%rbp \n\t"