* Completed the wordbreaker for the fulltext indexer.
author Art Cancro <ajc@citadel.org>
Tue, 17 May 2005 16:25:25 +0000 (16:25 +0000)
committer Art Cancro <ajc@citadel.org>
Tue, 17 May 2005 16:25:25 +0000 (16:25 +0000)
citadel/ChangeLog
citadel/Makefile.in
citadel/crc16.c [new file with mode: 0644]
citadel/crc16.d [new file with mode: 0644]
citadel/crc16.h [new file with mode: 0644]
citadel/crc16.o [new file with mode: 0644]
citadel/ft_wordbreaker.c
citadel/ft_wordbreaker.d
citadel/ft_wordbreaker.h
citadel/ft_wordbreaker.o

index 7bf15f790661f32a893aaef980141aa0dfa3c1b9..2ae16a9a407062095094c1a965204c81d094279a 100644 (file)
@@ -1,4 +1,7 @@
  $Log$
+ Revision 647.6  2005/05/17 16:25:23  ajc
+ * Completed the wordbreaker for the fulltext indexer.
+
  Revision 647.5  2005/05/17 04:04:46  ajc
  * More glue code for the fulltext indexer.
 
@@ -6702,4 +6705,3 @@ Sat Jul 11 00:20:48 EDT 1998 Nathan Bryant <bryant@cs.usm.maine.edu>
 
 Fri Jul 10 1998 Art Cancro <ajc@uncensored.citadel.org>
        * Initial CVS import
-
index 2b52fb8026554b6e82668896796e609fa89c68e1..e0bd45506baffd41500047308d0fe6fb2a86724d 100644 (file)
@@ -32,7 +32,7 @@ SERV_MODULES=serv_chat.o \
        serv_mrtg.o \
        serv_imap.o imap_fetch.o imap_misc.o imap_search.o \
                imap_store.o imap_tools.o \
-       serv_fulltext.o ft_wordbreaker.o \
+       serv_fulltext.o ft_wordbreaker.o crc16.o \
        serv_network.o \
        serv_listsub.o \
        serv_netfilter.o \
@@ -96,7 +96,8 @@ SOURCES=aidepost.c auth.c base64.c chkpwd.c citadel.c citadel_ipc.c \
        serv_spam.c serv_test.c serv_mrtg.c serv_spam.c serv_upgrade.c \
        serv_vandelay.c serv_vcard.c server_main.c setup.c snprintf.c \
        stress.c support.c sysdep.c tools.c user_ops.c userlist.c \
-       whobbs.c vcard.c serv_notes.c serv_fulltext.c ft_wordbreaker.c
+       whobbs.c vcard.c serv_notes.c serv_fulltext.c ft_wordbreaker.c \
+       crc16.c
 
 DEP_FILES=$(SOURCES:.c=.d)
 
diff --git a/citadel/crc16.c b/citadel/crc16.c
new file mode 100644 (file)
index 0000000..abfcd8e
--- /dev/null
@@ -0,0 +1,153 @@
+/****************************************************************************
+
+  Filename:     crc16.c
+  Description:  Cyclic Redundancy Check 16 functions
+  Created:      24-Feb-1999
+
+  Copyright (c) 1999-2003, Indigo Systems Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  Neither the name of the Indigo Systems Corporation nor the names of its
+  contributors may be used to endorse or promote products derived from this
+  software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+
+****************************************************************************/
+
+#include "crc16.h"
+
+#ifdef _OPT_SIZE
+
+
+/*
+ *  ===== ByteCRC16 =====
+ *      Calculate (update) the CRC16 for a single 8-bit byte.
+ *
+ *      This is the bitwise (table-free) update using the polynomial
+ *      constant 0x1021.  Only the low 8 bits of `value` and the low 16
+ *      bits of `crcin` take part in the calculation.
+ *
+ *      value - the next data byte
+ *      crcin - the CRC accumulated so far (0 to start a new CRC)
+ *
+ *      Returns the updated CRC, always in the range 0..0xFFFF.
+ *
+ *      All arithmetic is done on unsigned values masked to 16 bits, so a
+ *      negative `value` (e.g. a sign-extended char) or a running CRC fed
+ *      back in over many bytes can never shift a signed int out of range.
+ */
+int ByteCRC16(int value, int crcin)
+{
+    unsigned int crc_prev = (unsigned int) crcin & 0xFFFFu;
+    /* The byte selecting the remainder, aligned to the top of 16 bits */
+    unsigned int k = (((crc_prev >> 8) ^ (unsigned int) value) & 255u) << 8;
+    unsigned int crc = 0;
+    int bits;
+
+    for (bits = 0; bits < 8; ++bits)
+    {
+        if ((crc ^ k) & 0x8000u)
+            crc = (crc << 1) ^ 0x1021u;
+        else
+            crc <<= 1;
+        k <<= 1;
+    }
+    /* Bits above 15 are shift residue, not part of the CRC: drop them */
+    return (int) (((crc_prev << 8) ^ crc) & 0xFFFFu);
+}
+
+#else
+
+/*
+ * Lookup table of 256 CRC16 values, indexed by a single byte value
+ * (the ByteCRC16 macro masks its index with 255).
+ * NOTE(review): presumably precomputed with the same 0x1021 polynomial
+ * used by the _OPT_SIZE variant of ByteCRC16() -- entry 1 is 0x1021;
+ * the remaining entries have not been re-derived here.
+ */
+const CRC16 ccitt_16Table[] = {
+   0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50A5, 0x60C6, 0x70E7,
+   0x8108, 0x9129, 0xA14A, 0xB16B, 0xC18C, 0xD1AD, 0xE1CE, 0xF1EF,
+   0x1231, 0x0210, 0x3273, 0x2252, 0x52B5, 0x4294, 0x72F7, 0x62D6,
+   0x9339, 0x8318, 0xB37B, 0xA35A, 0xD3BD, 0xC39C, 0xF3FF, 0xE3DE,
+   0x2462, 0x3443, 0x0420, 0x1401, 0x64E6, 0x74C7, 0x44A4, 0x5485,
+   0xA56A, 0xB54B, 0x8528, 0x9509, 0xE5EE, 0xF5CF, 0xC5AC, 0xD58D,
+   0x3653, 0x2672, 0x1611, 0x0630, 0x76D7, 0x66F6, 0x5695, 0x46B4,
+   0xB75B, 0xA77A, 0x9719, 0x8738, 0xF7DF, 0xE7FE, 0xD79D, 0xC7BC,
+   0x48C4, 0x58E5, 0x6886, 0x78A7, 0x0840, 0x1861, 0x2802, 0x3823,
+   0xC9CC, 0xD9ED, 0xE98E, 0xF9AF, 0x8948, 0x9969, 0xA90A, 0xB92B,
+   0x5AF5, 0x4AD4, 0x7AB7, 0x6A96, 0x1A71, 0x0A50, 0x3A33, 0x2A12,
+   0xDBFD, 0xCBDC, 0xFBBF, 0xEB9E, 0x9B79, 0x8B58, 0xBB3B, 0xAB1A,
+   0x6CA6, 0x7C87, 0x4CE4, 0x5CC5, 0x2C22, 0x3C03, 0x0C60, 0x1C41,
+   0xEDAE, 0xFD8F, 0xCDEC, 0xDDCD, 0xAD2A, 0xBD0B, 0x8D68, 0x9D49,
+   0x7E97, 0x6EB6, 0x5ED5, 0x4EF4, 0x3E13, 0x2E32, 0x1E51, 0x0E70,
+   0xFF9F, 0xEFBE, 0xDFDD, 0xCFFC, 0xBF1B, 0xAF3A, 0x9F59, 0x8F78,
+   0x9188, 0x81A9, 0xB1CA, 0xA1EB, 0xD10C, 0xC12D, 0xF14E, 0xE16F,
+   0x1080, 0x00A1, 0x30C2, 0x20E3, 0x5004, 0x4025, 0x7046, 0x6067,
+   0x83B9, 0x9398, 0xA3FB, 0xB3DA, 0xC33D, 0xD31C, 0xE37F, 0xF35E,
+   0x02B1, 0x1290, 0x22F3, 0x32D2, 0x4235, 0x5214, 0x6277, 0x7256,
+   0xB5EA, 0xA5CB, 0x95A8, 0x8589, 0xF56E, 0xE54F, 0xD52C, 0xC50D,
+   0x34E2, 0x24C3, 0x14A0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405,
+   0xA7DB, 0xB7FA, 0x8799, 0x97B8, 0xE75F, 0xF77E, 0xC71D, 0xD73C,
+   0x26D3, 0x36F2, 0x0691, 0x16B0, 0x6657, 0x7676, 0x4615, 0x5634,
+   0xD94C, 0xC96D, 0xF90E, 0xE92F, 0x99C8, 0x89E9, 0xB98A, 0xA9AB,
+   0x5844, 0x4865, 0x7806, 0x6827, 0x18C0, 0x08E1, 0x3882, 0x28A3,
+   0xCB7D, 0xDB5C, 0xEB3F, 0xFB1E, 0x8BF9, 0x9BD8, 0xABBB, 0xBB9A,
+   0x4A75, 0x5A54, 0x6A37, 0x7A16, 0x0AF1, 0x1AD0, 0x2AB3, 0x3A92,
+   0xFD2E, 0xED0F, 0xDD6C, 0xCD4D, 0xBDAA, 0xAD8B, 0x9DE8, 0x8DC9,
+   0x7C26, 0x6C07, 0x5C64, 0x4C45, 0x3CA2, 0x2C83, 0x1CE0, 0x0CC1,
+   0xEF1F, 0xFF3E, 0xCF5D, 0xDF7C, 0xAF9B, 0xBFBA, 0x8FD9, 0x9FF8,
+   0x6E17, 0x7E36, 0x4E55, 0x5E74, 0x2E93, 0x3EB2, 0x0ED1, 0x1EF0
+};
+
+/*
+ * Table-driven CRC16 update for a single byte: shifts the running CRC
+ * left by 8 and folds in the table entry selected by (high CRC byte ^ v).
+ * The result is truncated to unsigned short.
+ *
+ * Every argument and the whole expansion are parenthesised so that the
+ * macro can be used with arbitrary expressions.  Note that `crc` is
+ * expanded twice: do not pass an expression with side effects.
+ */
+#define ByteCRC16(v, crc) \
+       ((unsigned short)(((crc) << 8) ^ ccitt_16Table[(((crc) >> 8) ^ (v)) & 255]))
+
+/*
+ *  ===== CalcCRC16Words =====
+ *      Calculate the CRC for a buffer of 16-bit words.  Supports both
+ *  Little and Big Endian formats using conditional compilation: when
+ *  _BIG_ENDIAN is defined the high byte of each word is fed first,
+ *  otherwise the low byte is fed first.
+ *
+ *      count  - number of 16-bit words to process
+ *      buffer - the words to checksum (at least `count` of them)
+ *
+ *      Returns the CRC16 of the buffer, starting from an initial CRC of 0.
+ *      A count of 0 returns that initial value without reading `buffer`.
+ */
+CRC16 CalcCRC16Words(unsigned int count, short *buffer) {
+
+    int crc = 0;
+
+    /* Pre-tested loop: a do/while would wrap `count` around when it is 0 */
+    while (count-- > 0) {
+
+        /* Use the raw 16 bits so a negative short is not sign-extended */
+        int value = (unsigned short) *buffer++;
+#ifdef _BIG_ENDIAN
+        crc = ByteCRC16(value >> 8, crc);
+        crc = ByteCRC16(value, crc);
+#else
+        crc = ByteCRC16(value, crc);
+        crc = ByteCRC16(value >> 8, crc);
+#endif
+    }
+    return (CRC16) crc;
+}
+
+#endif /* _OPT_SIZE */
+
+#ifdef _CRC16_BYTES
+
+/*
+ *  ===== CalcCRC16Bytes =====
+ *      Calculate the CRC for a buffer of 8-bit bytes.
+ *
+ *      count  - number of bytes to process
+ *      buffer - the bytes to checksum (at least `count` of them)
+ *
+ *      Returns the CRC16 of the buffer, starting from an initial CRC of 0.
+ *      A count of 0 returns that initial value without reading `buffer`.
+ */
+CRC16 CalcCRC16Bytes(unsigned int count, char *buffer) {
+
+    int crc = 0;
+
+    /* Pre-tested loop: a do/while would wrap `count` around when it is 0 */
+    while (count-- > 0) {
+
+        /* char may be signed; feed the byte as a value in 0..255 */
+        int value = (unsigned char) *buffer++;
+
+        /*
+         * Keep the running CRC within 16 bits whichever ByteCRC16 variant
+         * (macro or function) is compiled in, so it cannot grow with
+         * every byte processed.
+         */
+        crc = ByteCRC16(value, crc) & 0xFFFF;
+    }
+    return (CRC16) crc;
+}
+
+#endif /* _CRC16_BYTES */
+
+
diff --git a/citadel/crc16.d b/citadel/crc16.d
new file mode 100644 (file)
index 0000000..9ca0d8c
--- /dev/null
@@ -0,0 +1 @@
+crc16.o crc16/.o crc16.d: crc16.c crc16.h
diff --git a/citadel/crc16.h b/citadel/crc16.h
new file mode 100644 (file)
index 0000000..9f13af7
--- /dev/null
@@ -0,0 +1,67 @@
+/****************************************************************************
+
+  Filename:     crc16.h
+  Description:  Cyclic Redundancy Check 16 functions
+  Created:      24-Feb-1999
+
+  Copyright (c) 2002-2003, Indigo Systems Corporation
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+  Redistributions of source code must retain the above copyright notice, this
+  list of conditions and the following disclaimer.
+
+  Redistributions in binary form must reproduce the above copyright notice,
+  this list of conditions and the following disclaimer in the documentation
+  and/or other materials provided with the distribution.
+
+  Neither the name of the Indigo Systems Corporation nor the names of its
+  contributors may be used to endorse or promote products derived from this
+  software without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+
+****************************************************************************/
+
+/*
+ * NOTE(review): defined unconditionally and ahead of the include guard, so
+ * CalcCRC16Bytes() is always declared below and always compiled in crc16.c
+ * (which includes this header).
+ */
+#define _CRC16_BYTES   1               /* ig */
+
+#ifndef __CRC16_H__
+#define __CRC16_H__
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Type used for CRC16 results */
+typedef unsigned short CRC16;
+
+/*
+ * With _OPT_SIZE defined, crc16.c provides the bitwise ByteCRC16() function
+ * and does not compile CalcCRC16Words(); without it, the table-driven
+ * CalcCRC16Words() is available instead.
+ */
+#ifdef _OPT_SIZE
+    int ByteCRC16(int value, int crcin);
+#else
+    CRC16 CalcCRC16Words(unsigned int count, short *buffer);
+#endif
+
+/* CRC16 over `count` bytes of `buffer`; the CRC starts from 0 */
+#ifdef _CRC16_BYTES
+    CRC16 CalcCRC16Bytes(unsigned int count, char *buffer);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CRC16_H__ */
+
+
+
diff --git a/citadel/crc16.o b/citadel/crc16.o
new file mode 100644 (file)
index 0000000..bf0808c
Binary files /dev/null and b/citadel/crc16.o differ
index 4fc3f3aace3373f878e69762eb281fa25a80d337..6c69175d1b93b79066d9fa330bef48d33025dd9b 100644 (file)
@@ -28,6 +28,7 @@
 #endif
 
 #include <sys/wait.h>
+#include <ctype.h>
 #include <string.h>
 #include <limits.h>
 #include "citadel.h"
@@ -42,6 +43,7 @@
 #include "control.h"
 #include "tools.h"
 #include "ft_wordbreaker.h"
+#include "crc16.h"
 
 
 void wordbreaker(char *text, int *num_tokens, int **tokens) {
@@ -50,12 +52,66 @@ void wordbreaker(char *text, int *num_tokens, int **tokens) {
        int wb_num_alloc = 0;
        int *wb_tokens = NULL;
 
-       wb_num_tokens = 3;
-       wb_tokens = malloc(wb_num_tokens * sizeof(int));
+       char *ptr;
+       char *word_start;
+       char *word_end;
+       char ch;
+       int word_len;
+       char word[256];
+       int i;
+       int word_crc;
 
-       wb_tokens[0] = 6;
-       wb_tokens[1] = 7;       /* FIXME this obviously isn't a wordbreaker */
-       wb_tokens[2] = 8;
+       if (text == NULL) {             /* no NULL text please */
+               *num_tokens = 0;
+               *tokens = NULL;
+               return;
+       }
+
+       if (text[0] == 0) {             /* no empty text either */
+               *num_tokens = 0;
+               *tokens = NULL;
+               return;
+       }
+
+       ptr = text;
+       word_start = NULL;
+       while (ptr++, *ptr) {
+               ch = *ptr;
+               if (isalnum(ch)) {
+                       if (!word_start) {
+                               word_start = ptr;
+                       }
+               }
+               else {
+                       if (word_start) {
+                               word_end = ptr;
+                               --word_end;
+
+                               /* extract the word */
+                               word_len = word_end - word_start + 1;
+                               safestrncpy(word, word_start, sizeof word);
+                               word[word_len] = 0;
+                               word_start = NULL;
+
+                               /* are we ok with the length? */
+                               if ( (word_len >= WB_MIN)
+                                  && (word_len <= WB_MAX) ) {
+                                       for (i=0; i<word_len; ++i) {
+                                               word[i] = tolower(word[i]);
+                                       }
+                                       word_crc = (int)
+                                               CalcCRC16Bytes(word_len, word);
+
+                                       ++wb_num_tokens;
+                                       if (wb_num_tokens > wb_num_alloc) {
+                                               wb_num_alloc += 512;
+                                               wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
+                                       }
+                                       wb_tokens[wb_num_tokens - 1] = word_crc;
+                               }
+                       }
+               }
+       }
 
        *num_tokens = wb_num_tokens;
        *tokens = wb_tokens;
index 827722090a68f14e04d8b41368a4cf855c03564d..27308fbc630767678284fbfdc427160f6fc1e8ea 100644 (file)
@@ -24,7 +24,8 @@ ft_wordbreaker.o ft_wordbreaker/.o ft_wordbreaker.d: ft_wordbreaker.c sysdep.h /
   /usr/include/asm/errno.h /usr/include/sys/time.h \
   /usr/include/sys/wait.h /usr/include/sys/resource.h \
   /usr/include/bits/resource.h /usr/include/bits/waitflags.h \
-  /usr/include/bits/waitstatus.h /usr/include/string.h \
+  /usr/include/bits/waitstatus.h /usr/include/ctype.h \
+  /usr/include/string.h \
   /usr/lib/gcc/i386-redhat-linux/3.4.2/include/limits.h \
   /usr/lib/gcc/i386-redhat-linux/3.4.2/include/syslimits.h \
   /usr/include/limits.h /usr/include/bits/posix1_lim.h \
@@ -56,4 +57,4 @@ ft_wordbreaker.o ft_wordbreaker/.o ft_wordbreaker.d: ft_wordbreaker.c sysdep.h /
   /usr/include/openssl/ssl23.h sysdep_decls.h /usr/include/pthread.h \
   /usr/include/sched.h /usr/include/bits/initspin.h citserver.h \
   serv_extensions.h support.h config.h database.h msgbase.h control.h \
-  tools.h ft_wordbreaker.h
+  tools.h ft_wordbreaker.h crc16.h
index 97628357461eebe8cd6c8c09b9345c520f0278da..57bd97b7aeb8f0b20773f33f44b4a2855e5044bb 100644 (file)
  */
 #define        FT_WORDBREAKER_ID       0x0001
 
+/*
+ * Minimum and maximum length of words to index
+ */
+#define WB_MIN                 3
+#define WB_MAX                 40
+
 void wordbreaker(char *text, int *num_tokens, int **tokens);
index 67f2f693286b91545517729c16c4bcaa47c9ed25..a90949c621edab9c32d56581ea9dbd7c369d17db 100644 (file)
Binary files a/citadel/ft_wordbreaker.o and b/citadel/ft_wordbreaker.o differ