/*
* hash_str.c,v 2.1 1992/08/11 00:29:20 pete Exp
* hash_str.c,v
* Revision 2.1 1992/08/11 00:29:20 pete
* Added proper RCS log and id entries.
*
*/
#include <stdlib.h>
#if defined(DBUG_NEW) && !defined(_DBUG_NEW_H_)
#include "dbug_new.h"
#endif
/*
* From "Fast Hashing of Variable Length Text Strings" by Peter K. Pearson,
* June 1990, Communications of the ACM, pp 677-680. This implementation
* uses the variation to achieve a 16 bit hash index instead of the 8 bit as
* specified in the algorithm.
*
* The advantages of the algorithm are:
* - No restriction is placed on the length of the text string.
* - The length of the string need not be known before hand.
* - Very little arithmetic is performed on each character being
* hashed.
* - Similar strings are not likely to collide
* - Minimal, perfect hashing functions can be built in this form.
*
* Applying the test case to /usr/lib/dict/words produced 6871 collisions
* over 23739 entries vs. the more typical hash function (hash_simple) which
* produced 14665 collions over 23739 entries. On the other hand, smaller
* lists of words (such as the passwd file with 136 entries), had similiar
* number of collisions (~90 with a table size of 50).
*/
/*
* The psuedo random permutation of integers from 0-255 (Table 1 in the
* article.
*/
#define ARRAY_SIZE 256
static int table[ARRAY_SIZE] =
{
1, 87, 49, 12, 176, 178, 102, 166, 121, 193, 6, 84, 249, 230, 44, 163,
14, 197, 213, 181, 161, 85, 218, 80, 64, 239, 24, 226, 236, 142, 38, 200,
110, 177, 104, 103, 141, 253, 255, 50, 77, 101, 81, 18, 45, 96, 31, 222,
25, 107, 190, 70, 86, 237, 240, 34, 72, 242, 20,214, 244, 227, 149, 235,
97, 234, 57, 22, 60, 250, 82, 175, 208, 5, 127, 199, 111, 62, 135, 248,
174, 169, 211, 58, 66, 154, 106, 195, 245, 171, 17, 187, 182, 179, 0, 243,
132, 56, 148, 75, 128, 133, 158, 100, 130, 126, 91, 13, 153, 246, 216, 219,
119, 68, 223, 78, 83, 88, 201, 99, 122, 11, 92, 32, 136, 114, 52, 10,
138, 30, 48, 183, 156, 35, 61, 26, 143, 74, 251, 94, 129, 162, 63, 152,
170, 7, 115, 167, 241, 206, 3, 150, 55, 59, 151, 220, 90, 53, 23, 131,
125, 173, 15, 238, 79, 95, 89, 16, 105, 137, 225, 224, 217, 160, 37, 123,
118, 73, 2, 157, 46, 116, 9, 145, 134, 228, 207, 212, 202, 215, 69, 229,
27, 188, 67, 124, 168, 252, 42, 4, 29, 108, 21, 247, 19, 205, 39, 203,
233, 40, 186, 147, 198, 192, 155, 33, 164, 191, 98, 204, 165, 180, 117, 76,
140, 36, 210, 172, 41, 54, 159, 8, 185, 232, 113, 196, 231, 47, 146, 120,
51, 65, 28, 144, 254, 221, 93, 189, 194, 139, 112, 43, 71, 109, 184, 209
};
static int
calc_hash (const char *str, int offset)
{
int value;
if (!str)
return 0;
/*
* Determine the initial value (i.e. use the offset). This is no
* different if the offset == 0.
*/
value = table[*str++ + offset];
/*
* Now calculate the hash value
*/
while (*str) {
value = table[value ^ *str++];
}
return value;
}
int
hash_str (const char *str)
{
if (str && *str)
return (calc_hash (str, 0) << 8) | calc_hash (str, 1);
else if (str)
return *str;
else
return 0;
}
#ifdef TEST
static int
hash_simple (const char *str)
{
int value = 0;
int i;
if (!str)
return 0;
for (i = 0; *str; i++) {
value ^= (*str++ << ((i & 3)*8));
}
return value;
}
/*
* Let's use /usr/lib/dict/words to verify how good this hash function
* really is.
*/
main (int argc, char **argv)
{
FILE *fptr;
char *file = "/usr/lib/dict/words";
int index; /* index of hash values */
int count; /* number of words */
int use_trivial = 0; /* use simple hash function flag */
int collisions; /* how many collisions */
int *results; /* where they are distributed */
char buf[BUFSIZ]; /* buffer to read word into */
int i;
int table_size = 1 << 15;
int opt;
extern char *optarg;
while ((opt = getopt (argc, argv, "f:s:t")) != EOF) {
switch (opt) {
case 'f':
file = optarg;
break;
case 's':
table_size = atoi (optarg);
break;
case 't':
use_trivial = 1;
break;
}
}
if ((fptr = fopen (file, "r")) == NULL) {
perror (file);
exit (1);
}
results = (int *) malloc (sizeof (int) * table_size);
for (i = 0; i < table_size; i++) {
results[i] = 0;
}
count = 0;
collisions = 0;
while (fgets (buf, sizeof (buf), fptr) != NULL) {
if ((count % 500 ) == 0) {
printf (".");
fflush (stdout);
}
++count;
if (use_trivial)
index = hash_simple (buf) % table_size;
else
index = hash_str (buf) % table_size;
if (results[index]) {
++collisions;
}
++results[index];
}
printf ("n%d entries, %d collisionsn", count, collisions);
}
/*
* A simple function to check that the above array was typed in (yuch)
* correctly. The array is checked to make sure are values are between
* 0, 255; there are no duplicates; there are only 256 entries; all 256
* values are used. If there are any errors, the table is printed out (in
* the same format as the one on page 678.
*/
check_table ()
{
int i;
int array[ARRAY_SIZE];
int err = 0;
/* initialize the array */
for (i = 0; i < ARRAY_SIZE; i++)
array[i] = -1;
for (i = 0; i < ARRAY_SIZE; i++) {
if (table[i] < 0 || table[i] >= ARRAY_SIZE) {
++err;
printf ("table[%d] = %d (out of bounds)n", i, table[i]);
continue;
}
if (array[table[i]] != -1) {
++err;
printf ("table[%d] == table[%d] == %dn", i, array[i],
table[i]);
}
array[table[i]] = i;
}
/*
* Make sure everything is used
*/
if (err == 0) {
for (i = 0; i < ARRAY_SIZE; i++)
if (array[i] == -1) {
printf ("%d not in tablen", i);
++err;
}
}
/*
* Print out the table if there are any errors
*/
if (err) {
for (i = 0; i < ARRAY_SIZE; i++) {
printf ("%3d ", table[i]);
if ((i + 1) % 16 == 0)
printf ("n");
}
}
fprintf (stderr, "%d errorsn", err);
exit (err == 0?0:1);
}
#endif /* TEST */
|
|