moved doc to header and converted it to doxygen

0fb5e89c · Christian Mehlis · 5a45d158 · 0fb5e89c · 0fb5e89c · 0fb5e89c
Commit 0fb5e89c authored 11 years ago by Christian Mehlis
--- a/sys/bloom/bloom.c
+++ b/sys/bloom/bloom.c
-/******************************************************************************
- * bloom.c
- * ```````
- * Bloom filters
+/**
+ * Bloom filter implementation
 *
- * HISTORY
- *                                                   {x,  y,  z}
- * A Bloom filter is a probibalistic                  :   :   :
- * data structure with several interesting           /|\ /|\ /|\
- * properties, such as low memory usage,            / | X | X | \
- * asymmetric query confidence, and a very         /  |/ \|/ \|  \
- * speedy O(k) membership test.                   /   |   |   \   \
- *                                               /   /|  /|\  |\   \
- * Because a Bloom filter can                   .   . . . . . . .   .
- * accept any input that can be       00000000001000101010101010100010000000000
- * hashed effectively (such as                       "    "    "
- * strings), that membership test                     \   |   /
- * tends to draw a crowd. TNSTAAFL, but                \  |  /
- * as caveats go, the Bloom filters' are                \ | /
- * more interesting than incapacitating.                 \|/
- *                                                        :
- * Most notably, it can tell you with certainty          {w}
- * that an item 'i' is *not* a member of set 's',
- * but it can only tell you with some finite
- * probability whether an item 'i' *is* a member
- * of set 's'.
+ * Copyright (C) 2013 Freie Universität Berlin
 *
- * Still, along with the intriguing possibility of using bitwise AND and OR
- * to compute the logical union and intersection of two filters, the cheap
- * cost of adding elements to the filter set, and the low memory requirements,
- * the Bloom filter is a good choice for many applications.
+ * This file is subject to the terms and conditions of the GNU Lesser General
+ * Public License. See the file LICENSE in the top level directory for more
+ * details.
 *
- * NOTES
+ * @file
+ * @author Jason Linehan <patientulysses@gmail.com>
+ * @author Christian Mehlis <mehlis@inf.fu-berlin.de>
+ * @author Freie Universität Berlin, Computer Systems & Telematics
 *
- * Let's look more closely at the probability values.
- *
- * Assume that a hash function selects each array position with equal
- * probability. If m is the number of bits in the array, and k is the number
- * of hash functions, then the probability that a certain bit is not set
- * to 1 by a certain hash function during the insertion of an element is
- *
- *      1-(1/m).
- *
- * The probability that it is not set to 1 by any of the hash functions is
- *
- *      (1-(1/m))^k.
- *
- * If we have inserted n elements, the probability that a certain bit is
- * set 0 is
- *
- *      (1-(1/m))^kn,
- *
- * Meaning that the probability said bit is set to 1 is therefore
- *
- *      1-([1-(1/m)]^kn).
- *
- * Now test membership of an element that is not in the set. Each of the k
- * array positions computed by the hash functions is 1 with a probability
- * as above. The probability of all of them being 1, which would cause the
- * algorithm to erroneously claim that the element is in the set, is often
- * given as
- *
- *      (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
- *
- * This is not strictly correct as it assumes independence for the
- * probabilities of each bit being set. However, assuming it is a close
- * approximation we have that the probability of false positives descreases
- * as m (the number of bits in the array) increases, and increases as n
- * (the number of inserted elements) increases. For a given m and n, the
- * value of k (the number of hash functions) that minimizes the probability
- * is
- *
- *      (m/n)ln(2) ~~ 0.7(m/n),
- *
- * which gives the false positive probability of
- *
- *      2^-k ~~ 0.6185^(m/n).
- *
- * The required number of bits m, given n and a desired false positive
- * probability p (and assuming the optimal value of k is used) can be
- * computed by substituting the optimal value of k in the probability
- * expression above:
- *
- *      p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
- *
- * which simplifies to
- *
- *      ln(p) = -(m/n) * (ln2)^2.
- *
- * This results in the equation
- *
- *      m = -((n*ln(p)) / ((ln(2))^2))
- *
- * The classic filter uses
- *
- *       1.44*log2(1/eta)
- *
- * bits of space per inserted key, where eta is the false positive rate of
- * the Bloom filter.
- *
- * AUTHOR
- * Jason Linehan (patientulysses@gmail.com)
- *
- * LICENSE
- * Public domain.
- *
- ******************************************************************************/
+ */

 #include <limits.h>
 #include <stdarg.h>
@@ -112,23 +20,10 @@

 #include "bloom.h"

-
 #define SETBIT(a,n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT)))
 #define GETBIT(a,n) (a[n/CHAR_BIT] &  (1<<(n%CHAR_BIT)))
 #define ROUND(size) ((size + CHAR_BIT - 1) / CHAR_BIT)

-
-/******************************************************************************
- * bloom_new  Allocate and return a pointer to a new Bloom filter.
- * `````````
- * @size  : size of the bit array in the filter
- * @nfuncs: the number of hash functions
- * Returns: An allocated bloom filter
- *
- * USAGE
- * For best results, make 'size' a power of 2.
- *
- ******************************************************************************/
 struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) {
    struct bloom_t *bloom;
    va_list hashes;
@@ -171,14 +66,6 @@ struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) {
    return bloom;
 }

-
-/******************************************************************************
- * bloom_del  Delete a Bloom filter.
- * `````````
- * @bloom : The condemned.
- * Returns: nothing.
- *
- ******************************************************************************/
 void bloom_del(struct bloom_t *bloom)
 {
    free(bloom->a);
@@ -186,18 +73,6 @@ void bloom_del(struct bloom_t *bloom)
    free(bloom);
 }

-
-/******************************************************************************
- * bloom_add  Add a string to a Bloom filter.
- * `````````
- * @bloom : Bloom filter
- * @s     : string to add
- * Returns: nothing.
- *
- * CAVEAT
- * Once a string has been added to the filter, it cannot be "removed"!
- *
- ******************************************************************************/
 void bloom_add(struct bloom_t *bloom, const char *s)
 {
    unsigned int hash;
@@ -209,46 +84,6 @@ void bloom_add(struct bloom_t *bloom, const char *s)
    }
 }

-
-/******************************************************************************
- * bloom_check  Determine if a string is in the Bloom filter.
- * ```````````
- * @bloom : Bloom filter
- * @s     : string to add
- * Returns: false if string does not exist in the filter, otherwise true.
- *
- * NOTES
- *
- * So this is the freakshow that bored programmers pay a nickel to get a
- * peek at, step right up. This is the way the membership test works.
- *
- * The string 's' is hashed once for each of the 'k' hash functions, as
- * though we were planning to add it to the filter. Instead of adding it
- * however, we examine the bit that we *would* have set, and consider its
- * value.
- *
- * If the bit is 1 (set), the string we are hashing may be in the filter,
- * since it would have set this bit when it was originally hashed. However,
- * it may also be that another string just happened to produce a hash value
- * that would also set this bit. That would be a false positive. This is why
- * we have k > 1, so we can minimize the likelihood of false positives
- * occuring.
- *
- * If every bit corresponding to every one of the k hashes of our query
- * string is set, we can say with some probability of being correct that
- * the string we are holding is indeed "in" the filter. However, we can
- * never be sure.
- *
- * If, however, as we hash our string and peek at the resulting bit in the
- * filter, we find the bit is 0 (not set)... well now, that's different.
- * In this case, we can say with absolute certainty that the string we are
- * holding is *not* in the filter, because if it were, this bit would have
- * to be set.
- *
- * In this way, the Bloom filter can answer NO with absolute surety, but
- * can only speak a qualified YES.
- *
- ******************************************************************************/
 bool bloom_check(struct bloom_t *bloom, const char *s)
 {
    unsigned int hash;

--- a/sys/include/bloom.h
+++ b/sys/include/bloom.h
+/**
+ * bloom.h
+ *
+ * Bloom filters
+ *
+ * HISTORY
+ *                                                   {x,  y,  z}
+ * A Bloom filter is a probabilistic                  :   :   :
+ * data structure with several interesting           /|\ /|\ /|\
+ * properties, such as low memory usage,            / | X | X | \
+ * asymmetric query confidence, and a very         /  |/ \|/ \|  \
+ * speedy O(k) membership test.                   /   |   |   \   \
+ *                                               /   /|  /|\  |\   \
+ * Because a Bloom filter can                   .   . . . . . . .   .
+ * accept any input that can be       00000000001000101010101010100010000000000
+ * hashed effectively (such as                       "    "    "
+ * strings), that membership test                     \   |   /
+ * tends to draw a crowd. TNSTAAFL, but                \  |  /
+ * as caveats go, the Bloom filters' are                \ | /
+ * more interesting than incapacitating.                 \|/
+ *                                                        :
+ * Most notably, it can tell you with certainty          {w}
+ * that an item 'i' is *not* a member of set 's',
+ * but it can only tell you with some finite
+ * probability whether an item 'i' *is* a member
+ * of set 's'.
+ *
+ * Still, along with the intriguing possibility of using bitwise AND and OR
+ * to compute the logical union and intersection of two filters, the cheap
+ * cost of adding elements to the filter set, and the low memory requirements,
+ * the Bloom filter is a good choice for many applications.
+ *
+ * NOTES
+ *
+ * Let's look more closely at the probability values.
+ *
+ * Assume that a hash function selects each array position with equal
+ * probability. If m is the number of bits in the array, and k is the number
+ * of hash functions, then the probability that a certain bit is not set
+ * to 1 by a certain hash function during the insertion of an element is
+ *
+ *      1-(1/m).
+ *
+ * The probability that it is not set to 1 by any of the hash functions is
+ *
+ *      (1-(1/m))^k.
+ *
+ * If we have inserted n elements, the probability that a certain bit is
+ * still 0 is
+ *
+ *      (1-(1/m))^kn,
+ *
+ * Meaning that the probability said bit is set to 1 is therefore
+ *
+ *      1-([1-(1/m)]^kn).
+ *
+ * Now test membership of an element that is not in the set. Each of the k
+ * array positions computed by the hash functions is 1 with a probability
+ * as above. The probability of all of them being 1, which would cause the
+ * algorithm to erroneously claim that the element is in the set, is often
+ * given as
+ *
+ *      (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k.
+ *
+ * This is not strictly correct as it assumes independence for the
+ * probabilities of each bit being set. However, assuming it is a close
+ * approximation we have that the probability of false positives decreases
+ * as m (the number of bits in the array) increases, and increases as n
+ * (the number of inserted elements) increases. For a given m and n, the
+ * value of k (the number of hash functions) that minimizes the probability
+ * is
+ *
+ *      (m/n)ln(2) ~~ 0.7(m/n),
+ *
+ * which gives the false positive probability of
+ *
+ *      2^-k ~~ 0.6185^(m/n).
+ *
+ * The required number of bits m, given n and a desired false positive
+ * probability p (and assuming the optimal value of k is used) can be
+ * computed by substituting the optimal value of k in the probability
+ * expression above:
+ *
+ *      p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)),
+ *
+ * which simplifies to
+ *
+ *      ln(p) = -(m/n) * (ln2)^2.
+ *
+ * This results in the equation
+ *
+ *      m = -((n*ln(p)) / ((ln(2))^2))
+ *
+ * The classic filter uses
+ *
+ *       1.44*log2(1/eta)
+ *
+ * bits of space per inserted key, where eta is the false positive rate of
+ * the Bloom filter.
+ *
+ */
+
+/**
+ * @file
+ * @author Christian Mehlis <mehlis@inf.fu-berlin.de>
+ * @author Freie Universität Berlin, Computer Systems & Telematics
+ */
+
 #ifndef _BLOOM_FILTER_H
 #define _BLOOM_FILTER_H

@@ -5,8 +113,14 @@
 #include <stdbool.h>
 #include <stdint.h>

+/**
+ * hashfp_t  hash function to use in the filter
+ */
 typedef unsigned int (*hashfp_t)(const char *);

+/**
+ * struct bloom_t bloom filter object
+ */
 struct bloom_t {
    size_t m;
    size_t k;
@@ -14,9 +128,77 @@ struct bloom_t {
    hashfp_t *hash;
 };

+/**
+ * bloom_new  Allocate and return a pointer to a new Bloom filter.
+ *
+ * For best results, make 'size' a power of 2.
+ *
+ * @param size        size of the bit array in the filter
+ * @param num_hashes  the number of hash functions
+ * @param ...         variadic hash function pointers, each of type hashfp_t
+ *
+ * @return An allocated bloom filter
+ *
+ */
 struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...);
+
+/**
+ * bloom_del  Delete a Bloom filter.
+ *
+ * @param bloom The condemned
+ * @return nothing
+ *
+ */
 void bloom_del(struct bloom_t *bloom);
+
+/**
+ * bloom_add  Add a string to a Bloom filter.
+ *
+ * CAVEAT
+ * Once a string has been added to the filter, it cannot be "removed"!
+ *
+ * @param bloom  Bloom filter
+ * @param s      string to add
+ * @return       nothing
+ *
+ */
 void bloom_add(struct bloom_t *bloom, const char *s);
+
+/**
+ * bloom_check  Determine if a string is in the Bloom filter.
+ *
+ * The string 's' is hashed once for each of the 'k' hash functions, as
+ * though we were planning to add it to the filter. Instead of adding it
+ * however, we examine the bit that we *would* have set, and consider its
+ * value.
+ *
+ * If the bit is 1 (set), the string we are hashing may be in the filter,
+ * since it would have set this bit when it was originally hashed. However,
+ * it may also be that another string just happened to produce a hash value
+ * that would also set this bit. That would be a false positive. This is why
+ * we have k > 1, so we can minimize the likelihood of false positives
+ * occurring.
+ *
+ * If every bit corresponding to every one of the k hashes of our query
+ * string is set, we can say with some probability of being correct that
+ * the string we are holding is indeed "in" the filter. However, we can
+ * never be sure.
+ *
+ * If, however, as we hash our string and peek at the resulting bit in the
+ * filter, we find the bit is 0 (not set)... well now, that's different.
+ * In this case, we can say with absolute certainty that the string we are
+ * holding is *not* in the filter, because if it were, this bit would have
+ * to be set.
+ *
+ * In this way, the Bloom filter can answer NO with absolute surety, but
+ * can only speak a qualified YES.
+ *
+ * @param bloom  Bloom filter
+ * @param s      string to check
+ * @return       false if string does not exist in the filter
+ * @return       true if the string may be in the filter
+ *
+ */
 bool bloom_check(struct bloom_t *bloom, const char *s);

 #endif
--- a/sys/include/hashes.h
+++ b/sys/include/hashes.h
-/******************************************************************************
+/**
+ * This file contains some simple hash functions
+ *
+ * Copyright (C) 2013 Freie Universität Berlin
+ *
+ * This file is subject to the terms and conditions of the GNU Lesser General
+ * Public License. See the file LICENSE in the top level directory for more
+ * details.
+ */
+
+/**
+ * @file
+ * @author      Jason Linehan <patientulysses@gmail.com>
+ * @author      Freie Universität Berlin, Computer Systems & Telematics
+ * @author      Christian Mehlis <mehlis@inf.fu-berlin.de>
+ */
+
+/**
 * djb2_hash
- * `````````
+ *
 * HISTORY
 * This algorithm (k=33) was first reported by Dan Bernstein many years
 * ago in comp.lang.c. Another version of this algorithm (now favored by
@@ -10,8 +27,7 @@
 *
 * The magic of number 33 (why it works better than many other constants,
 * prime or not) has never been adequately explained.
- *
- ******************************************************************************/
+ */
 static inline unsigned long djb2_hash(const char *str)
 {
    unsigned long hash;
@@ -26,9 +42,9 @@ static inline unsigned long djb2_hash(const char *str)
    return hash;
 }

-/******************************************************************************
+/**
 * sdbm_hash
- * `````````
+ *
 * HISTORY
 * This algorithm was created for sdbm (a public-domain reimplementation
 * of ndbm) database library. It was found to do well in scrambling bits,
@@ -45,7 +61,7 @@ static inline unsigned long djb2_hash(const char *str)
 * out to be a prime. this is one of the algorithms used in berkeley db
 * (see sleepycat) and elsewhere.
 *
- ******************************************************************************/
+ */
 static inline unsigned long sdbm_hash(const char *str)
 {
    unsigned long hash;
@@ -60,9 +76,9 @@ static inline unsigned long sdbm_hash(const char *str)
    return hash;
 }

-/******************************************************************************
+/**
 * lose lose
- * `````````
+ *
 * HISTORY
 * This hash function appeared in K&R (1st ed) but at least the reader
 * was warned:
@@ -78,8 +94,7 @@ static inline unsigned long sdbm_hash(const char *str)
 * checking something like Knuth's Sorting and Searching, so it stuck.
 * It is now found mixed with otherwise respectable code, eg. cnews. sigh.
 * [see also: tpop]
- *
- ******************************************************************************/
+ */
 static inline unsigned long kr_hash(const char *str)
 {
    unsigned int hash;
@@ -94,12 +109,11 @@ static inline unsigned long kr_hash(const char *str)
    return hash;
 }

-/******************************************************************************
+/**
 * sax_hash
- * ````````
- * Shift, Add, XOR
 *
- ******************************************************************************/
+ * Shift, Add, XOR
+ */
 static inline unsigned int sax_hash(const char *key)
 {
    unsigned int h;
@@ -114,14 +128,13 @@ static inline unsigned int sax_hash(const char *key)
 }


-/******************************************************************************
+/**
 * dek_hash
- * ````````
+ *
 * HISTORY
 * Proposed by Donald E. Knuth in The Art Of Computer Programming Vol. 3,
 * under the topic of "Sorting and Search", Chapter 6.4.
- *
- ******************************************************************************/
+ */
 static inline unsigned int dek_hash(const char *str, unsigned int len)
 {
    unsigned int hash;
@@ -138,13 +151,12 @@ static inline unsigned int dek_hash(const char *str, unsigned int len)
 }


-/******************************************************************************
+/**
 * fnv_hash
- * ````````
+ *
 * NOTE
 * For a more fully featured and modern version of this hash, see fnv32.c
- *
- ******************************************************************************/
+ */
 static inline unsigned int fnv_hash(const char *str)
 {
 #define FNV_PRIME 0x811C9DC5