From 0fb5e89c893a14ba4a89b572bdca4a8d00fd69df Mon Sep 17 00:00:00 2001 From: Christian Mehlis <mehlis@inf.fu-berlin.de> Date: Sat, 10 Aug 2013 12:06:09 +0200 Subject: [PATCH] moved doc to header and converted it to doxygen --- sys/bloom/bloom.c | 187 +++---------------------------------------- sys/include/bloom.h | 182 +++++++++++++++++++++++++++++++++++++++++ sys/include/hashes.h | 58 ++++++++------ 3 files changed, 228 insertions(+), 199 deletions(-) diff --git a/sys/bloom/bloom.c b/sys/bloom/bloom.c index ef12f4ec5d..941a16e3e5 100644 --- a/sys/bloom/bloom.c +++ b/sys/bloom/bloom.c @@ -1,110 +1,18 @@ -/****************************************************************************** - * bloom.c - * ``````` - * Bloom filters +/** + * Bloom filter implementation * - * HISTORY - * {x, y, z} - * A Bloom filter is a probibalistic : : : - * data structure with several interesting /|\ /|\ /|\ - * properties, such as low memory usage, / | X | X | \ - * asymmetric query confidence, and a very / |/ \|/ \| \ - * speedy O(k) membership test. / | | \ \ - * / /| /|\ |\ \ - * Because a Bloom filter can . . . . . . . . . - * accept any input that can be 00000000001000101010101010100010000000000 - * hashed effectively (such as " " " - * strings), that membership test \ | / - * tends to draw a crowd. TNSTAAFL, but \ | / - * as caveats go, the Bloom filters' are \ | / - * more interesting than incapacitating. \|/ - * : - * Most notably, it can tell you with certainty {w} - * that an item 'i' is *not* a member of set 's', - * but it can only tell you with some finite - * probability whether an item 'i' *is* a member - * of set 's'. + * Copyright (C) 2013 Freie Universität Berlin * - * Still, along with the intriguing possibility of using bitwise AND and OR - * to compute the logical union and intersection of two filters, the cheap - * cost of adding elements to the filter set, and the low memory requirements, - * the Bloom filter is a good choice for many applications. 
+ * This file is subject to the terms and conditions of the GNU Lesser General + * Public License. See the file LICENSE in the top level directory for more + * details. * - * NOTES + * @file + * @author Jason Linehan <patientulysses@gmail.com> + * @author Christian Mehlis <mehlis@inf.fu-berlin.de> + * @author Freie Universität Berlin, Computer Systems & Telematics * - * Let's look more closely at the probability values. - * - * Assume that a hash function selects each array position with equal - * probability. If m is the number of bits in the array, and k is the number - * of hash functions, then the probability that a certain bit is not set - * to 1 by a certain hash function during the insertion of an element is - * - * 1-(1/m). - * - * The probability that it is not set to 1 by any of the hash functions is - * - * (1-(1/m))^k. - * - * If we have inserted n elements, the probability that a certain bit is - * set 0 is - * - * (1-(1/m))^kn, - * - * Meaning that the probability said bit is set to 1 is therefore - * - * 1-([1-(1/m)]^kn). - * - * Now test membership of an element that is not in the set. Each of the k - * array positions computed by the hash functions is 1 with a probability - * as above. The probability of all of them being 1, which would cause the - * algorithm to erroneously claim that the element is in the set, is often - * given as - * - * (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k. - * - * This is not strictly correct as it assumes independence for the - * probabilities of each bit being set. However, assuming it is a close - * approximation we have that the probability of false positives descreases - * as m (the number of bits in the array) increases, and increases as n - * (the number of inserted elements) increases. For a given m and n, the - * value of k (the number of hash functions) that minimizes the probability - * is - * - * (m/n)ln(2) ~~ 0.7(m/n), - * - * which gives the false positive probability of - * - * 2^-k ~~ 0.6185^(m/n). 
- * - * The required number of bits m, given n and a desired false positive - * probability p (and assuming the optimal value of k is used) can be - * computed by substituting the optimal value of k in the probability - * expression above: - * - * p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)), - * - * which simplifies to - * - * ln(p) = -(m/n) * (ln2)^2. - * - * This results in the equation - * - * m = -((n*ln(p)) / ((ln(2))^2)) - * - * The classic filter uses - * - * 1.44*log2(1/eta) - * - * bits of space per inserted key, where eta is the false positive rate of - * the Bloom filter. - * - * AUTHOR - * Jason Linehan (patientulysses@gmail.com) - * - * LICENSE - * Public domain. - * - ******************************************************************************/ + */ #include <limits.h> #include <stdarg.h> @@ -112,23 +20,10 @@ #include "bloom.h" - #define SETBIT(a,n) (a[n/CHAR_BIT] |= (1<<(n%CHAR_BIT))) #define GETBIT(a,n) (a[n/CHAR_BIT] & (1<<(n%CHAR_BIT))) #define ROUND(size) ((size + CHAR_BIT - 1) / CHAR_BIT) - -/****************************************************************************** - * bloom_new Allocate and return a pointer to a new Bloom filter. - * ````````` - * @size : size of the bit array in the filter - * @nfuncs: the number of hash functions - * Returns: An allocated bloom filter - * - * USAGE - * For best results, make 'size' a power of 2. - * - ******************************************************************************/ struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) { struct bloom_t *bloom; va_list hashes; @@ -171,14 +66,6 @@ struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...) { return bloom; } - -/****************************************************************************** - * bloom_del Delete a Bloom filter. - * ````````` - * @bloom : The condemned. - * Returns: nothing. 
- * - ******************************************************************************/ void bloom_del(struct bloom_t *bloom) { free(bloom->a); @@ -186,18 +73,6 @@ void bloom_del(struct bloom_t *bloom) free(bloom); } - -/****************************************************************************** - * bloom_add Add a string to a Bloom filter. - * ````````` - * @bloom : Bloom filter - * @s : string to add - * Returns: nothing. - * - * CAVEAT - * Once a string has been added to the filter, it cannot be "removed"! - * - ******************************************************************************/ void bloom_add(struct bloom_t *bloom, const char *s) { unsigned int hash; @@ -209,46 +84,6 @@ void bloom_add(struct bloom_t *bloom, const char *s) } } - -/****************************************************************************** - * bloom_check Determine if a string is in the Bloom filter. - * ``````````` - * @bloom : Bloom filter - * @s : string to add - * Returns: false if string does not exist in the filter, otherwise true. - * - * NOTES - * - * So this is the freakshow that bored programmers pay a nickel to get a - * peek at, step right up. This is the way the membership test works. - * - * The string 's' is hashed once for each of the 'k' hash functions, as - * though we were planning to add it to the filter. Instead of adding it - * however, we examine the bit that we *would* have set, and consider its - * value. - * - * If the bit is 1 (set), the string we are hashing may be in the filter, - * since it would have set this bit when it was originally hashed. However, - * it may also be that another string just happened to produce a hash value - * that would also set this bit. That would be a false positive. This is why - * we have k > 1, so we can minimize the likelihood of false positives - * occuring. 
- * - * If every bit corresponding to every one of the k hashes of our query - * string is set, we can say with some probability of being correct that - * the string we are holding is indeed "in" the filter. However, we can - * never be sure. - * - * If, however, as we hash our string and peek at the resulting bit in the - * filter, we find the bit is 0 (not set)... well now, that's different. - * In this case, we can say with absolute certainty that the string we are - * holding is *not* in the filter, because if it were, this bit would have - * to be set. - * - * In this way, the Bloom filter can answer NO with absolute surety, but - * can only speak a qualified YES. - * - ******************************************************************************/ bool bloom_check(struct bloom_t *bloom, const char *s) { unsigned int hash; diff --git a/sys/include/bloom.h b/sys/include/bloom.h index 290ebbcf5b..19b692a2c5 100644 --- a/sys/include/bloom.h +++ b/sys/include/bloom.h @@ -1,3 +1,111 @@ +/** + * bloom.c + * + * Bloom filters + * + * HISTORY + * {x, y, z} + * A Bloom filter is a probibalistic : : : + * data structure with several interesting /|\ /|\ /|\ + * properties, such as low memory usage, / | X | X | \ + * asymmetric query confidence, and a very / |/ \|/ \| \ + * speedy O(k) membership test. / | | \ \ + * / /| /|\ |\ \ + * Because a Bloom filter can . . . . . . . . . + * accept any input that can be 00000000001000101010101010100010000000000 + * hashed effectively (such as " " " + * strings), that membership test \ | / + * tends to draw a crowd. TNSTAAFL, but \ | / + * as caveats go, the Bloom filters' are \ | / + * more interesting than incapacitating. \|/ + * : + * Most notably, it can tell you with certainty {w} + * that an item 'i' is *not* a member of set 's', + * but it can only tell you with some finite + * probability whether an item 'i' *is* a member + * of set 's'. 
+ * + * Still, along with the intriguing possibility of using bitwise AND and OR + * to compute the logical union and intersection of two filters, the cheap + * cost of adding elements to the filter set, and the low memory requirements, + * the Bloom filter is a good choice for many applications. + * + * NOTES + * + * Let's look more closely at the probability values. + * + * Assume that a hash function selects each array position with equal + * probability. If m is the number of bits in the array, and k is the number + * of hash functions, then the probability that a certain bit is not set + * to 1 by a certain hash function during the insertion of an element is + * + * 1-(1/m). + * + * The probability that it is not set to 1 by any of the hash functions is + * + * (1-(1/m))^k. + * + * If we have inserted n elements, the probability that a certain bit is + * still 0 is + * + * (1-(1/m))^kn, + * + * Meaning that the probability said bit is set to 1 is therefore + * + * 1-([1-(1/m)]^kn). + * + * Now test membership of an element that is not in the set. Each of the k + * array positions computed by the hash functions is 1 with a probability + * as above. The probability of all of them being 1, which would cause the + * algorithm to erroneously claim that the element is in the set, is often + * given as + * + * (1-[1-(1/m)]^kn)^k ~~ (1 - e^(-kn/m))^k. + * + * This is not strictly correct as it assumes independence for the + * probabilities of each bit being set. However, assuming it is a close + * approximation we have that the probability of false positives decreases + * as m (the number of bits in the array) increases, and increases as n + * (the number of inserted elements) increases. For a given m and n, the + * value of k (the number of hash functions) that minimizes the probability + * is + * + * (m/n)ln(2) ~~ 0.7(m/n), + * + * which gives the false positive probability of + * + * 2^-k ~~ 0.6185^(m/n). 
+ * + * The required number of bits m, given n and a desired false positive + * probability p (and assuming the optimal value of k is used) can be + * computed by substituting the optimal value of k in the probability + * expression above: + * + * p = (1 - e^(-(((m/n)ln(2))*(n/m))))^((m/n)ln(2)), + * + * which simplifies to + * + * ln(p) = -(m/n) * (ln2)^2. + * + * This results in the equation + * + * m = -((n*ln(p)) / ((ln(2))^2)) + * + * The classic filter uses + * + * 1.44*log2(1/eta) + * + * bits of space per inserted key, where eta is the false positive rate of + * the Bloom filter. + * + */ + +/** + * @file + * @author Christian Mehlis <mehlis@inf.fu-berlin.de> + * @author Freie Universität Berlin, Computer Systems & Telematics + */ + #ifndef _BLOOM_FILTER_H #define _BLOOM_FILTER_H @@ -5,8 +113,14 @@ #include <stdbool.h> #include <stdint.h> +/** + * hashfp_t hash function to use in the filter + */ typedef unsigned int (*hashfp_t)(const char *); +/** + * struct bloom_t bloom filter object + */ struct bloom_t { size_t m; size_t k; @@ -14,9 +128,77 @@ struct bloom_t { hashfp_t *hash; }; +/** + * bloom_new Allocate and return a pointer to a new Bloom filter. + * + * For best results, make 'size' a power of 2. + * + * @param size size of the bit array in the filter + * @param num_hashes the number of hash functions + * @param ... varg function pointers, use hashfp_t + * + * @return An allocated bloom filter + * + */ struct bloom_t *bloom_new(size_t size, size_t num_hashes, ...); + +/** + * bloom_del Delete a Bloom filter. + * + * @param bloom The condemned + * @return nothing + * + */ void bloom_del(struct bloom_t *bloom); + +/** + * bloom_add Add a string to a Bloom filter. + * + * CAVEAT + * Once a string has been added to the filter, it cannot be "removed"! 
+ * + * @param bloom Bloom filter + * @param s string to add + * @return nothing + * + */ void bloom_add(struct bloom_t *bloom, const char *s); + +/** + * bloom_check Determine if a string is in the Bloom filter. + * + * The string 's' is hashed once for each of the 'k' hash functions, as + * though we were planning to add it to the filter. Instead of adding it + * however, we examine the bit that we *would* have set, and consider its + * value. + * + * If the bit is 1 (set), the string we are hashing may be in the filter, + * since it would have set this bit when it was originally hashed. However, + * it may also be that another string just happened to produce a hash value + * that would also set this bit. That would be a false positive. This is why + * we have k > 1, so we can minimize the likelihood of false positives + * occurring. + * + * If every bit corresponding to every one of the k hashes of our query + * string is set, we can say with some probability of being correct that + * the string we are holding is indeed "in" the filter. However, we can + * never be sure. + * + * If, however, as we hash our string and peek at the resulting bit in the + * filter, we find the bit is 0 (not set)... well now, that's different. + * In this case, we can say with absolute certainty that the string we are + * holding is *not* in the filter, because if it were, this bit would have + * to be set. + * + * In this way, the Bloom filter can answer NO with absolute surety, but + * can only speak a qualified YES. 
+ * + * @param bloom Bloom filter + * @param s string to check + * @return false if string does not exist in the filter + * @return true if string may be in the filter + * + */ bool bloom_check(struct bloom_t *bloom, const char *s); #endif diff --git a/sys/include/hashes.h b/sys/include/hashes.h index eaf3c19bd3..e81aa196a4 100644 --- a/sys/include/hashes.h +++ b/sys/include/hashes.h @@ -1,6 +1,23 @@ -/****************************************************************************** +/** + * This file contains some simple hash functions + * + * Copyright (C) 2013 Freie Universität Berlin + * + * This file is subject to the terms and conditions of the GNU Lesser General + * Public License. See the file LICENSE in the top level directory for more + * details. + */ + +/** + * @file + * @author Jason Linehan <patientulysses@gmail.com> + * @author Freie Universität Berlin, Computer Systems & Telematics + * @author Christian Mehlis <mehlis@inf.fu-berlin.de> + */ + +/** * djb2_hash - * ````````` + * * HISTORY * This algorithm (k=33) was first reported by Dan Bernstein many years * ago in comp.lang.c. Another version of this algorithm (now favored by @@ -10,8 +27,7 @@ * * The magic of number 33 (why it works better than many other constants, * prime or not) has never been adequately explained. - * - ******************************************************************************/ + */ static inline unsigned long djb2_hash(const char *str) { unsigned long hash; @@ -26,9 +42,9 @@ static inline unsigned long djb2_hash(const char *str) return hash; } -/****************************************************************************** +/** * sdbm_hash - * ````````` + * * HISTORY * This algorithm was created for sdbm (a public-domain reimplementation * of ndbm) database library. It was found to do well in scrambling bits, @@ -45,7 +61,7 @@ static inline unsigned long djb2_hash(const char *str) * out to be a prime. 
this is one of the algorithms used in berkeley db * (see sleepycat) and elsewhere. * - ******************************************************************************/ + */ static inline unsigned long sdbm_hash(const char *str) { unsigned long hash; @@ -60,9 +76,9 @@ static inline unsigned long sdbm_hash(const char *str) return hash; } -/****************************************************************************** +/** * lose lose - * ````````` + * * HISTORY * This hash function appeared in K&R (1st ed) but at least the reader * was warned: @@ -78,8 +94,7 @@ static inline unsigned long sdbm_hash(const char *str) * checking something like Knuth's Sorting and Searching, so it stuck. * It is now found mixed with otherwise respectable code, eg. cnews. sigh. * [see also: tpop] - * - ******************************************************************************/ + */ static inline unsigned long kr_hash(const char *str) { unsigned int hash; @@ -94,12 +109,11 @@ static inline unsigned long kr_hash(const char *str) return hash; } -/****************************************************************************** +/** * sax_hash - * ```````` - * Shift, Add, XOR * - ******************************************************************************/ + * Shift, Add, XOR + */ static inline unsigned int sax_hash(const char *key) { unsigned int h; @@ -114,14 +128,13 @@ static inline unsigned int sax_hash(const char *key) } -/****************************************************************************** +/** * dek_hash - * ```````` + * * HISTORY * Proposed by Donald E. Knuth in The Art Of Computer Programming Vol. 3, * under the topic of "Sorting and Search", Chapter 6.4. 
- * - ******************************************************************************/ + */ static inline unsigned int dek_hash(const char *str, unsigned int len) { unsigned int hash; @@ -138,13 +151,12 @@ static inline unsigned int dek_hash(const char *str, unsigned int len) } -/****************************************************************************** +/** * fnv_hash - * ```````` + * * NOTE * For a more fully featured and modern version of this hash, see fnv32.c - * - ******************************************************************************/ + */ static inline unsigned int fnv_hash(const char *str) { #define FNV_PRIME 0x811C9DC5 -- GitLab