Skip to content
Snippets Groups Projects
Unverified Commit 9c85ce19 authored by Joakim Nohlgård's avatar Joakim Nohlgård Committed by GitHub
Browse files

Merge pull request #8733 from gebart/pr/matstat

matstat: Integer mathematical statistics library
parents b522d5e6 5c59f6a3
No related branches found
No related tags found
No related merge requests found
/*
* Copyright (C) 2018 Eistec AB
*
* This file is subject to the terms and conditions of the GNU Lesser
* General Public License v2.1. See the file LICENSE in the top level
* directory for more details.
*/
/**
* @defgroup sys_matstat Matstat - Integer mathematical statistics library
* @ingroup sys
* @brief Library for computing 1-pass statistics
*
* The Matstat library uses single pass algorithms to compute statistic measures
* such as mean and variance over many values. The values can be immediately
* discarded after processing, keeping the memory requirement constant
* regardless of how many values need to be processed.
*
* The design goal is to provide basic mathematical statistics operations on
* constrained devices with a "good enough" accuracy to be able to provide some
* descriptive measures of data. For more accurate measures of statistics, use a
* fancier library, or copy the data to a PC.
*
* It is important to know that using integer operations will result in lower
* precision in the computed measures because of truncation.
*
* @{
* @file
* @brief Matstat library declarations
*
* @author Joakim Nohlgård <joakim.nohlgard@eistec.se>
*/
#ifndef MATSTAT_H
#define MATSTAT_H
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief Internal state for computing running statistics
*/
typedef struct {
int64_t sum; /**< Sum of values added */
uint64_t sum_sq; /**< Sum of squared differences */
uint32_t count; /**< Number of values added */
int32_t mean; /**< Mean value */
int32_t min; /**< Minimum value seen */
int32_t max; /**< Maximum value seen */
} matstat_state_t;
/**
* @brief Empty state initializer
*/
#define MATSTAT_STATE_INIT (const matstat_state_t) { \
.sum = 0, \
.sum_sq = 0, \
.count = 0, \
.mean = 0, \
.min = INT32_MAX, \
.max = INT32_MIN, \
}
/**
* @brief Reset state
*
* @param[in] state State struct to clear
*/
void matstat_clear(matstat_state_t *state);
/**
* @brief Add a sample to state
*
* @param[in] state State struct to operate on
* @param[in] value Value to add to the state
*/
void matstat_add(matstat_state_t *state, int32_t value);
/**
* @brief Return the computed mean value of all samples so far
*
* @param[in] state State struct to operate on
*
* @return arithmetic mean
*/
static inline int32_t matstat_mean(const matstat_state_t *state)
{
return state->mean;
}
/**
* @brief Compute the sample variance of all samples so far
*
* @param[in] state State struct to operate on
*
* @return sample variance
*/
uint64_t matstat_variance(const matstat_state_t *state);
/**
* @brief Combine two states
*
* Add the sums and count of @p src and @p dest, take the maximum of the max
* values and minimum of the min values. The result is written to @p dest.
*
* @param[inout] dest destination state struct
* @param[out] src source state struct
*/
void matstat_merge(matstat_state_t *dest, const matstat_state_t *src);
#ifdef __cplusplus
}
#endif
#endif /* MATSTAT_H */
/** @} */
include $(RIOTBASE)/Makefile.base
/*
* Copyright (C) 2018 Eistec AB
*
* This file is subject to the terms and conditions of the GNU Lesser
* General Public License v2.1. See the file LICENSE in the top level
* directory for more details.
*/
#include <stdint.h>
#include "matstat.h"
#define ENABLE_DEBUG (0)
#include "debug.h"
void matstat_clear(matstat_state_t *state)
{
*state = MATSTAT_STATE_INIT;
}
void matstat_add(matstat_state_t *state, int32_t value)
{
if (value > state->max) {
state->max = value;
}
if (value < state->min) {
state->min = value;
}
state->sum += value;
/* Using Welford's algorithm for variance */
++state->count;
if (state->count == 1) {
state->sum_sq = 0;
state->mean = value;
}
else {
int32_t new_mean = state->sum / state->count;
int64_t diff = (value - state->mean) * (value - new_mean);
if ((diff < 0) && ((uint64_t)(-diff) > state->sum_sq)) {
/* Handle corner cases where sum_sq becomes negative */
state->sum_sq = 0;
}
else {
state->sum_sq += diff;
}
state->mean = new_mean;
}
}
uint64_t matstat_variance(const matstat_state_t *state)
{
if (state->count < 2) {
/* We don't have any way of returning an error */
return 0;
}
uint64_t variance = state->sum_sq / (state->count - 1);
DEBUG("Var: (%" PRIu64 " / (%" PRId32 " - 1)) = %" PRIu64 "\n",
state->sum_sq, state->count, variance);
return variance;
}
void matstat_merge(matstat_state_t *dest, const matstat_state_t *src)
{
if (src->count == 0) {
/* src is empty, no-op */
return;
}
if (dest->count == 0) {
/* dest is empty, straight copy */
*dest = *src;
return;
}
/* Combining the variance of the two samples needs some extra
* handling if the means are different between the two states,
* source: https://stats.stackexchange.com/a/43183
* (using sum_sq = sigma2 * n, instead of sum_sq = sigma2 * (n-1) to simplify algorithm)
*/
dest->sum_sq = (dest->sum_sq + dest->sum * dest->mean + src->sum_sq + src->sum * src->mean);
dest->count += src->count;
dest->sum += src->sum;
int32_t new_mean = dest->sum / dest->count;
int64_t diff = -new_mean * dest->sum;
if ((diff < 0) && ((uint64_t)(-diff) > dest->sum_sq)) {
/* Handle corner cases where sum_sq becomes negative */
dest->sum_sq = 0;
}
else {
dest->sum_sq += diff;
}
dest->mean = new_mean;
if (src->max > dest->max) {
dest->max = src->max;
}
if (src->min < dest->min) {
dest->min = src->min;
}
}
include $(RIOTBASE)/Makefile.base
USEMODULE += matstat
/*
* Copyright (C) 2018 Eistec AB
*
* This file is subject to the terms and conditions of the GNU Lesser
* General Public License v2.1. See the file LICENSE in the top level
* directory for more details.
*/
#include <string.h>
#include "embUnit.h"
#include "tests-matstat.h"
#include "matstat.h"
#define ENABLE_DEBUG (0)
#include "debug.h"
/* White box testing of matstat library */
static void test_matstat_basic(void)
{
/* nothing special, only verifying the basic functionality */
matstat_state_t state = MATSTAT_STATE_INIT;
matstat_add(&state, 10);
TEST_ASSERT_EQUAL_INT(10, state.min);
TEST_ASSERT_EQUAL_INT(10, state.max);
TEST_ASSERT_EQUAL_INT(1, state.count);
matstat_add(&state, 20);
TEST_ASSERT_EQUAL_INT(10, state.min);
TEST_ASSERT_EQUAL_INT(20, state.max);
TEST_ASSERT_EQUAL_INT(2, state.count);
matstat_add(&state, 30);
TEST_ASSERT_EQUAL_INT(10, state.min);
TEST_ASSERT_EQUAL_INT(30, state.max);
TEST_ASSERT_EQUAL_INT(3, state.count);
matstat_add(&state, 40);
TEST_ASSERT_EQUAL_INT(10, state.min);
TEST_ASSERT_EQUAL_INT(40, state.max);
TEST_ASSERT_EQUAL_INT(4, state.count);
int32_t mean = matstat_mean(&state);
TEST_ASSERT_EQUAL_INT(25, mean);
uint64_t var = matstat_variance(&state);
TEST_ASSERT_EQUAL_INT(166, var);
matstat_clear(&state);
TEST_ASSERT_EQUAL_INT(0, state.count);
}
static void test_matstat_var_stability(void)
{
/* This test is designed to detect stability errors where the values are
* located very close together, which should yield a very low variance. */
/* The initial implementation of the variance algorithm resulted in a very
* large variance in this test, due to cancellation problems */
matstat_state_t state = MATSTAT_STATE_INIT;
matstat_add(&state, 999999);
matstat_add(&state, 1000000);
matstat_add(&state, 1000000);
matstat_add(&state, 1000000);
matstat_add(&state, 1000000);
matstat_add(&state, 1000000);
matstat_add(&state, 1000000);
matstat_add(&state, 1000000);
matstat_add(&state, 1000000);
matstat_add(&state, 1000000);
int32_t mean = matstat_mean(&state);
TEST_ASSERT(mean >= 999999);
TEST_ASSERT(mean <= 1000000);
uint64_t var = matstat_variance(&state);
TEST_ASSERT(var <= 1);
}
static void test_matstat_negative_variance(void)
{
/* This is a regression test for two related problems where the truncation
* in the mean computation (integer division) causes the sum_sq value to become
* negative, or the variance itself to become negative */
matstat_state_t state = MATSTAT_STATE_INIT;
matstat_add(&state, -1);
matstat_add(&state, 0);
uint64_t var = matstat_variance(&state);
TEST_ASSERT_EQUAL_INT(0, var);
matstat_clear(&state);
matstat_add(&state, 1);
matstat_add(&state, 0);
matstat_add(&state, 0);
matstat_add(&state, 0);
var = matstat_variance(&state);
TEST_ASSERT_EQUAL_INT(0, var);
matstat_clear(&state);
matstat_add(&state, 1234567);
for (unsigned int k = 0; k < 9999; ++k) {
matstat_add(&state, 1234567);
matstat_add(&state, 1234566);
}
var = matstat_variance(&state);
TEST_ASSERT_EQUAL_INT(0, var);
}
static void test_matstat_merge_basic(void)
{
/* This is a basic test of the merging functionality without any "special" cases */
matstat_state_t state1 = MATSTAT_STATE_INIT;
matstat_state_t state2 = MATSTAT_STATE_INIT;
matstat_state_t state_ref = MATSTAT_STATE_INIT;
matstat_add(&state1, 2000);
matstat_add(&state_ref, 2000);
matstat_add(&state1, 1000);
matstat_add(&state_ref, 1000);
matstat_add(&state1, 2000);
matstat_add(&state_ref, 2000);
matstat_add(&state2, 2000);
matstat_add(&state_ref, 2000);
matstat_add(&state2, 2456);
matstat_add(&state_ref, 2456);
matstat_add(&state2, 1234);
matstat_add(&state_ref, 1234);
matstat_add(&state2, 5678);
matstat_add(&state_ref, 5678);
matstat_add(&state2, 9999);
matstat_add(&state_ref, 9999);
matstat_merge(&state1, &state2);
TEST_ASSERT_EQUAL_INT(state_ref.min, state1.min);
TEST_ASSERT_EQUAL_INT(state_ref.max, state1.max);
TEST_ASSERT_EQUAL_INT(state_ref.count, state1.count);
TEST_ASSERT_EQUAL_INT(state_ref.sum, state1.sum);
int32_t mean = matstat_mean(&state1);
int32_t mean_ref = matstat_mean(&state_ref);
TEST_ASSERT_EQUAL_INT(mean_ref, mean);
}
static void test_matstat_merge_empty(void)
{
/* Testing merging with one or more empty states */
matstat_state_t state1 = MATSTAT_STATE_INIT;
matstat_state_t state2 = MATSTAT_STATE_INIT;
matstat_merge(&state1, &state2);
TEST_ASSERT_EQUAL_INT(0, state1.count);
TEST_ASSERT_EQUAL_INT(0, state2.count);
TEST_ASSERT_EQUAL_INT(0, state1.sum);
TEST_ASSERT_EQUAL_INT(0, state2.sum);
TEST_ASSERT_EQUAL_INT(0, state1.sum_sq);
TEST_ASSERT_EQUAL_INT(0, state2.sum_sq);
matstat_add(&state1, 2000);
matstat_add(&state1, 1000);
matstat_add(&state1, 2000);
matstat_merge(&state1, &state2);
TEST_ASSERT_EQUAL_INT(1000, state1.min);
TEST_ASSERT_EQUAL_INT(2000, state1.max);
TEST_ASSERT_EQUAL_INT(3, state1.count);
TEST_ASSERT_EQUAL_INT(0, state2.count);
matstat_clear(&state1);
TEST_ASSERT_EQUAL_INT(0, state1.count);
matstat_add(&state2, 2000);
matstat_add(&state2, 1000);
matstat_add(&state2, 2000);
TEST_ASSERT_EQUAL_INT(3, state2.count);
matstat_merge(&state1, &state2);
TEST_ASSERT_EQUAL_INT(1000, state1.min);
TEST_ASSERT_EQUAL_INT(2000, state1.max);
TEST_ASSERT_EQUAL_INT(3, state1.count);
}
static void test_matstat_merge_variance(void)
{
/* This test should ensure that merging states from separate sequences will
* yield correct results for the variance computation */
matstat_state_t state1 = MATSTAT_STATE_INIT;
matstat_state_t state2 = MATSTAT_STATE_INIT;
matstat_state_t state_ref = MATSTAT_STATE_INIT;
matstat_add(&state1, 2000);
matstat_add(&state_ref, 2000);
matstat_add(&state1, 1000);
matstat_add(&state_ref, 1000);
matstat_add(&state1, 2000);
matstat_add(&state_ref, 2000);
matstat_add(&state2, 9999);
matstat_add(&state_ref, 9999);
matstat_add(&state2, 2456);
matstat_add(&state_ref, 2456);
matstat_add(&state2, 1234);
matstat_add(&state_ref, 1234);
matstat_add(&state2, 5678);
matstat_add(&state_ref, 5678);
matstat_add(&state2, 9999);
matstat_add(&state_ref, 9999);
matstat_merge(&state1, &state2);
uint64_t var = matstat_variance(&state1);
uint64_t var_ref = matstat_variance(&state_ref);
int64_t var_diff = var - var_ref;
/* There will invariably be some loss of accuracy because of the integer
* operations involved in the variance computation. */
TEST_ASSERT(var_diff < 1000);
TEST_ASSERT(var_diff > -1000);
TEST_ASSERT_EQUAL_INT(state_ref.mean, state1.mean);
}
static void test_matstat_merge_variance_regr1(void)
{
/* This is a regression check for an issue where the sum_sq variable became
* negative after merging a sequence of states with different means, and
* small but non-zero sum_sq values. */
/* Numbers were taken from a stats dump from the bench_timers application */
matstat_state_t inputs[] = {
{ .count = 2686, .sum = 5414, .sum_sq = 1380, .min = 1, .max = 3, .mean = 2 },
{ .count = 2643, .sum = 5272, .sum_sq = 3263, .min = 1, .max = 3, .mean = 1 },
{ .count = 2650, .sum = 5328, .sum_sq = 719, .min = 1, .max = 3, .mean = 2 },
{ .count = 2562, .sum = 5117, .sum_sq = 2756, .min = 1, .max = 3, .mean = 1 },
{ .count = 2579, .sum = 5157, .sum_sq = 635, .min = 1, .max = 3, .mean = 1 },
{ .count = 2533, .sum = 5050, .sum_sq = 2944, .min = 1, .max = 3, .mean = 1 },
{ .count = 2630, .sum = 5276, .sum_sq = 1078, .min = 1, .max = 3, .mean = 2 },
{ .count = 2667, .sum = 5333, .sum_sq = 974, .min = 1, .max = 3, .mean = 1 },
{ .count = 2414, .sum = 4859, .sum_sq = 1074, .min = 1, .max = 3, .mean = 2 },
};
matstat_state_t merged = MATSTAT_STATE_INIT;
for (unsigned k = 0; k < sizeof(inputs) / sizeof(inputs[0]); ++k) {
matstat_merge(&merged, &inputs[k]);
}
int64_t var = (int64_t)matstat_variance(&merged);
/* Expected variance for this input is 0, because of integer truncation of the result.
* The bug gave the following result instead:
* count = 23364, sum = 46806, sum_sq = 18446744073709540510, mean = 2, var = 789570863061659
*/
/* Left here for debugging test case failures: */
/* printf("\nmerged: count = %" PRIu32 ", sum = %" PRId64 ", sum_sq = %" PRIu64 ", "
"mean = %" PRId32 ", var = %" PRIu64 "\n", merged.count, merged.sum,
merged.sum_sq, merged.mean, var); */
TEST_ASSERT((int64_t)merged.sum_sq > 0);
TEST_ASSERT(var >= 0);
}
static void test_matstat_accuracy(void)
{
/* This test verifies that the numeric accuracy is "good enough" */
matstat_state_t state = MATSTAT_STATE_INIT;
/*
* The test values below were sampled from a normal distribution with
* mean = 12345
* standard deviation = 10000 => variance = 100000000
*
* The sample distribution, when computed with double precision floating
* point values, is:
* sample variance = 115969073.207895
* sample mean = 12293.05
*/
/* This test will fail unless the library adaptively adjusts the offset to
* reduce the error in the variance */
matstat_add(&state, -9228);
matstat_add(&state, 6225);
matstat_add(&state, 15935);
matstat_add(&state, 1171);
matstat_add(&state, 9500);
matstat_add(&state, 22805);
matstat_add(&state, 6484);
matstat_add(&state, 10157);
matstat_add(&state, 23870);
matstat_add(&state, 9010);
matstat_add(&state, 16093);
matstat_add(&state, 20969);
matstat_add(&state, 18077);
matstat_add(&state, 9202);
matstat_add(&state, 20074);
matstat_add(&state, 19236);
matstat_add(&state, 32276);
matstat_add(&state, 6342);
matstat_add(&state, 18759);
matstat_add(&state, -11096);
int32_t mean = matstat_mean(&state);
uint64_t var = matstat_variance(&state);
int64_t var_diff = var - 115969073;
TEST_ASSERT(var_diff < 10000);
TEST_ASSERT(var_diff > -10000);
TEST_ASSERT_EQUAL_INT(12293, mean);
}
Test *tests_matstat_tests(void)
{
EMB_UNIT_TESTFIXTURES(fixtures) {
new_TestFixture(test_matstat_basic),
new_TestFixture(test_matstat_var_stability),
new_TestFixture(test_matstat_merge_basic),
new_TestFixture(test_matstat_merge_empty),
new_TestFixture(test_matstat_merge_variance),
new_TestFixture(test_matstat_merge_variance_regr1),
new_TestFixture(test_matstat_accuracy),
new_TestFixture(test_matstat_negative_variance),
};
EMB_UNIT_TESTCALLER(matstat_tests, NULL, NULL, fixtures);
return (Test *)&matstat_tests;
}
void tests_matstat(void)
{
TESTS_RUN(tests_matstat_tests());
}
/*
* Copyright (C) 2018 Eistec AB
*
* This file is subject to the terms and conditions of the GNU Lesser
* General Public License v2.1. See the file LICENSE in the top level
* directory for more details.
*/
/**
* @addtogroup unittests
* @{
*
* @file
* @brief Unittests for the Matstat library
*
* @author Joakim Nohlgård <joakim.nohlgard@eistec.se>
*/
#ifndef TESTS_MATSTAT_H
#define TESTS_MATSTAT_H
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief The entry point of this test suite.
*/
void tests_matstat(void);
/**
* @brief Generates tests for matstat
*
* @return embUnit tests if successful, NULL if not.
*/
Test *tests_matstat_tests(void);
#ifdef __cplusplus
}
#endif
#endif /* TESTS_MATSTAT_H */
/** @} */
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment