diff --git a/cpu/cortexm_common/thread_arch.c b/cpu/cortexm_common/thread_arch.c
index 7523e45a4de607105c10669074440ed326982d6f..ee878af3b592fe3ca3c2f81068824dff479fa8ac 100644
--- a/cpu/cortexm_common/thread_arch.c
+++ b/cpu/cortexm_common/thread_arch.c
@@ -148,22 +148,6 @@ char *thread_stack_init(thread_task_func_t task_func,
         *stk = ~((uint32_t)STACK_MARKER);
     }
 
-#if defined(CPU_ARCH_CORTEX_M4F) || (CPU_ARCH_CORTEX_M7)
-    /* TODO: fix FPU handling for Cortex-M4f */
-    /*
-    stk--;
-    *stk = (unsigned int) 0;
-    */
-
-    /* S0 - S15 */
-    /*
-    for (int i = 15; i >= 0; i--) {
-        stk--;
-        *stk = i;
-    }
-    */
-#endif
-
     /* ****************************** */
     /* Automatically popped registers */
     /* ****************************** */
@@ -299,7 +283,7 @@ void __attribute__((naked)) __attribute__((used)) isr_pendsv(void) {
     __asm__ volatile (
     /* PendSV handler entry point */
     /* save context by pushing unsaved registers to the stack */
-    /* {r0-r3,r12,LR,PC,xPSR} are saved automatically on exception entry */
+    /* {r0-r3,r12,LR,PC,xPSR} (and {s0-s15,FPSCR} if the FPU is in use) are saved automatically on exception entry */
     ".thumb_func                      \n"
     "mrs    r0, psp                   \n" /* get stack pointer from user mode */
 #if defined(CPU_ARCH_CORTEX_M0) || defined(CPU_ARCH_CORTEX_M0PLUS)
@@ -317,11 +301,13 @@ void __attribute__((naked)) __attribute__((used)) isr_pendsv(void) {
     "mov    r0, sp                    \n" /* switch back to the exception SP */
     "mov    sp, r12                   \n"
 #else
+#if (defined(CPU_ARCH_CORTEX_M4F) || defined(CPU_ARCH_CORTEX_M7)) && defined(MODULE_CORTEXM_FPU)
+    "tst    lr, #0x10                 \n"
+    "it     eq                        \n"
+    "vstmdbeq r0!, {s16-s31}          \n" /* save FPU registers if FPU is used */
+#endif
     "stmdb  r0!,{r4-r11}              \n" /* save regs */
     "stmdb  r0!,{lr}                  \n" /* exception return value */
-#if defined(CPU_ARCH_CORTEX_M4F) || defined(CPU_ARCH_CORTEX_M7)
-/*  "vstmdb sp!, {s16-s31}            \n" */ /* TODO save FPU registers */
-#endif
 #endif
     "ldr    r1, =sched_active_thread  \n" /* load address of current tcb */
     "ldr    r1, [r1]                  \n" /* dereference pdc */
@@ -364,14 +350,16 @@ void __attribute__((naked)) __attribute__((used)) isr_svc(void) {
     "ldr    r0, [r0]                  \n" /* dereference TCB */
     "ldr    r1, [r0]                  \n" /* load tcb->sp to register 1 */
     "ldmia  r1!, {r0}                 \n" /* restore exception return value */
-#if defined(CPU_ARCH_CORTEX_M4F) || defined(CPU_ARCH_CORTEX_M7)
-/*  "pop    {s16-s31}                 \n" */ /* TODO load FPU registers */
-#endif
     "ldmia  r1!, {r4-r11}             \n" /* restore other registers */
+#if (defined(CPU_ARCH_CORTEX_M4F) || defined(CPU_ARCH_CORTEX_M7)) && defined(MODULE_CORTEXM_FPU)
+    "tst    r0, #0x10                 \n"
+    "it     eq                        \n"
+    "vldmiaeq r1!, {s16-s31}          \n" /* load FPU registers if saved */
+#endif
     "msr    psp, r1                   \n" /* restore user mode SP to PSP reg */
     "bx     r0                        \n" /* load exception return value to PC,
                                            * causes end of exception*/
 #endif
-    /* {r0-r3,r12,LR,PC,xPSR} are restored automatically on exception return */
+    /* {r0-r3,r12,LR,PC,xPSR} (and {s0-s15,FPSCR} if they were stacked) are restored automatically on exception return */
     );
 }
diff --git a/makefiles/arch/cortexm.inc.mk b/makefiles/arch/cortexm.inc.mk
index 6a29f27a1974259e78238876bc1fe32330f7dcb8..617afc5ad261dc6fbb3685418455779c4d643983 100644
--- a/makefiles/arch/cortexm.inc.mk
+++ b/makefiles/arch/cortexm.inc.mk
@@ -63,14 +63,29 @@ ARCH = $(shell echo $(CPU_ARCH) | tr 'a-z-' 'A-Z_')
 export CFLAGS += -DCPU_ARCH_$(ARCH)
 
 # set the compiler specific CPU and FPU options
-ifeq ($(CPU_ARCH),cortex-m4f)
-# TODO: enable hard floating points for the M4F once the context save/restore
-#       code is adjusted to take care of FPU registers
-#export CFLAGS_FPU += -mfloat-abi=hard -mfpu=fpv4-sp-d16
-export MCPU := cortex-m4
-endif
+ifneq (,$(filter $(CPU_ARCH),cortex-m4f cortex-m7))
+    ifneq (,$(filter cortexm_fpu,$(DISABLE_MODULE)))
+        export CFLAGS_FPU ?= -mfloat-abi=soft
+    else
+        USEMODULE += cortexm_fpu
+        # clang assumes there is an FPU
+        ifneq (llvm,$(TOOLCHAIN))
+            ifeq ($(CPU_ARCH),cortex-m7)
+                export CFLAGS_FPU ?= -mfloat-abi=hard -mfpu=fpv5-d16
+            else
+                export CFLAGS_FPU ?= -mfloat-abi=hard -mfpu=fpv4-sp-d16
+            endif
+        endif
+    endif
+    ifeq ($(CPU_ARCH),cortex-m4f)
+        export MCPU := cortex-m4
+    else
+        export MCPU ?= $(CPU_ARCH)
+    endif
+else
 CFLAGS_FPU ?= -mfloat-abi=soft
 export MCPU ?= $(CPU_ARCH)
+endif
 
 # CMSIS DSP needs to know about the CPU core
 ifneq (,$(filter cmsis-dsp,$(USEPKG)))
diff --git a/makefiles/pseudomodules.inc.mk b/makefiles/pseudomodules.inc.mk
index 497dbdb71e65256f33c13dac59680797e643cad8..3de942f1657ba9809a7d36015d64f2825aefc1a3 100644
--- a/makefiles/pseudomodules.inc.mk
+++ b/makefiles/pseudomodules.inc.mk
@@ -8,6 +8,7 @@ PSEUDOMODULES += conn_can_isotp_multi
 PSEUDOMODULES += cord_ep_standalone
 PSEUDOMODULES += cord_epsim_standalone
 PSEUDOMODULES += core_%
+PSEUDOMODULES += cortexm_fpu
 PSEUDOMODULES += ecc_%
 PSEUDOMODULES += emb6_router
 PSEUDOMODULES += event_%
diff --git a/pkg/cmsis-dsp/patches/0001-include-cpu_conf.h.patch b/pkg/cmsis-dsp/patches/0001-include-cpu_conf.h.patch
new file mode 100644
index 0000000000000000000000000000000000000000..7064e4624dc8e91613a144ef9cbbdfa55f71186e
Binary files /dev/null and b/pkg/cmsis-dsp/patches/0001-include-cpu_conf.h.patch differ
diff --git a/tests/thread_float/Makefile b/tests/thread_float/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..aa1ac8ea44b9c03d8a253525352a228806696baa
--- /dev/null
+++ b/tests/thread_float/Makefile
@@ -0,0 +1,16 @@
+APPLICATION = thread_float
+include ../Makefile.tests_common
+
+BOARD_INSUFFICIENT_MEMORY := airfy-beacon arduino-uno arduino-duemilanove \
+                             calliope-mini cc2650stk chronos maple-mini \
+                             mbed_lpc1768 microbit msb-430 msb-430h nrf51dongle \
+                             nrf6310 nucleo-f031k6 nucleo-f042k6 \
+                             opencm9-04 pca10000 pca10005 spark-core \
+                             stm32f0discovery weio yunjia-nrf51822
+
+USEMODULE += printf_float
+USEMODULE += xtimer
+
+#DISABLE_MODULE += cortexm_fpu
+
+include $(RIOTBASE)/Makefile.include
diff --git a/tests/thread_float/main.c b/tests/thread_float/main.c
new file mode 100644
index 0000000000000000000000000000000000000000..2c7b15d2a7873c05c7db5bc7a4fd47b60a6d03ad
--- /dev/null
+++ b/tests/thread_float/main.c
@@ -0,0 +1,110 @@
+/*
+ * Copyright (C) 2017 OTA keys S.A.
+ *
+ * This file is subject to the terms and conditions of the GNU Lesser
+ * General Public License v2.1. See the file LICENSE in the top level
+ * directory for more details.
+ */
+
+/**
+ * @ingroup tests
+ * @{
+ *
+ * @file
+ * @brief Thread test application using floating point in several threads
+ *
+ * @author Vincent Dupont <vincent@otakeys.com>
+ *
+ * @}
+ */
+
+#include <stdio.h>
+
+#include "thread.h"
+#include "msg.h"
+#include "xtimer.h"
+#include "timex.h"
+
+static char t1_stack[THREAD_STACKSIZE_MAIN]; /* stack for thread "nr1" */
+static char t2_stack[THREAD_STACKSIZE_MAIN]; /* stack for thread "nr2" */
+static char t3_stack[THREAD_STACKSIZE_MAIN]; /* stack for thread "nr3" */
+
+static kernel_pid_t p1, p2, p3; /* values returned by thread_create() in main() */
+
+static xtimer_t timer; /* set in main() and re-armed from timer_cb() */
+#define OFFSET (10 * XTIMER_BACKOFF) /* offset passed to every xtimer_set() call */
+
+static mutex_t lock = MUTEX_INIT; /* held around the result printf() in thread1() */
+
+static void timer_cb(void *arg)
+{
+    (void) arg;
+    /* Timer callback: request a thread switch, then re-arm the timer */
+    thread_yield();
+    xtimer_set(&timer, OFFSET);
+}
+
+static void *thread1(void *arg)
+{
+    (void) arg;
+    /* Endlessly runs a float computation and prints each result under `lock` */
+    float f, init;
+
+    printf("THREAD %" PRIkernel_pid " start\n", thread_getpid());
+
+    init = 1.0 * thread_getpid(); /* start value derived from this thread's PID */
+    f = init;
+
+    while (1) {
+        for (unsigned long i = 0; i < 10000ul; i++) {
+            f = f + 1.0 / f; /* NOTE(review): 1.0 is a double literal => double math; 1.0f intended? */
+        }
+        mutex_lock(&lock); /* keep the result line from interleaving with the other thread1 instance */
+        printf("T(%" PRIkernel_pid "): %f\n", thread_getpid(), (double)f);
+        mutex_unlock(&lock);
+        init += 1.0; /* next round starts from the next start value */
+        f = init;
+    }
+    return NULL; /* not reached: the loop above never exits */
+}
+
+static void *thread2(void *arg)
+{
+    (void) arg;
+    /* Endlessly runs a float computation; prints nothing after the start line */
+    float f, init;
+
+    printf("THREAD %" PRIkernel_pid " start\n", thread_getpid());
+
+    init = 1.0 * thread_getpid(); /* start value derived from this thread's PID */
+    f = init;
+
+    while (1) {
+        for (unsigned long i = 0; i < 100000ul; i++) {
+            f = f + 1.0 / f; /* NOTE(review): 1.0 is a double literal => double math; 1.0f intended? */
+        }
+        init += 1.0; /* next round starts from the next start value */
+        f = init;
+    }
+    return NULL; /* not reached: the loop above never exits */
+}
+
+int main(void)
+{
+    p1 = thread_create(t1_stack, sizeof(t1_stack), THREAD_PRIORITY_MAIN + 1,
+                       THREAD_CREATE_WOUT_YIELD | THREAD_CREATE_STACKTEST,
+                       thread1, NULL, "nr1"); /* computes and prints */
+    p2 = thread_create(t2_stack, sizeof(t2_stack), THREAD_PRIORITY_MAIN + 1,
+                       THREAD_CREATE_WOUT_YIELD | THREAD_CREATE_STACKTEST,
+                       thread2, NULL, "nr2"); /* computes without printing results */
+    p3 = thread_create(t3_stack, sizeof(t3_stack), THREAD_PRIORITY_MAIN + 1,
+                       THREAD_CREATE_WOUT_YIELD | THREAD_CREATE_STACKTEST,
+                       thread1, NULL, "nr3"); /* second instance of thread1() */
+    puts("THREADS CREATED\n");
+
+    /* Arm the timer; timer_cb() yields and re-arms it on every expiry */
+    timer.callback = timer_cb;
+    xtimer_set(&timer, OFFSET);
+
+    return 0;
+}