#include "citserver.h"
#include "support.h"
#include "config.h"
-#include "serv_extensions.h"
#include "database.h"
#include "msgbase.h"
#include "control.h"
#include "ft_wordbreaker.h"
#include "crc16.h"
+/*
+ * Noise words are not included in search indices.  The table is read-only:
+ * both the array and the strings it points to are const.  wordbreaker()
+ * compares each extracted word against it case-insensitively.
+ *
+ * NOTE: if the noise word list is altered in any way, the FT_WORDBREAKER_ID
+ * must also be changed, so that the index is rebuilt.
+ */
+static const char *const noise_words[] = {
+ "about",
+ "after",
+ "all",
+ "also",
+ "an",
+ "and",
+ "another",
+ "any",
+ "are",
+ "as",
+ "at",
+ "be",
+ "because",
+ "been",
+ "before",
+ "being",
+ "between",
+ "both",
+ "but",
+ "by",
+ "came",
+ "can",
+ "come",
+ "could",
+ "did",
+ "do",
+ "each",
+ "for",
+ "from",
+ "get",
+ "got",
+ "had",
+ "has",
+ "have",
+ "he",
+ "her",
+ "here",
+ "him",
+ "himself",
+ "his",
+ "how",
+ "if",
+ "in",
+ "into",
+ "is",
+ "it",
+ "like",
+ "make",
+ "many",
+ "me",
+ "might",
+ "more",
+ "most",
+ "much",
+ "must",
+ "my",
+ "never",
+ "now",
+ "of",
+ "on",
+ "only",
+ "or",
+ "other",
+ "our",
+ "out",
+ "over",
+ "said",
+ "same",
+ "see",
+ "should",
+ "since",
+ "some",
+ "still",
+ "such",
+ "take",
+ "than",
+ "that",
+ "the",
+ "their",
+ "them",
+ "then",
+ "there",
+ "these",
+ "they",
+ "this",
+ "those",
+ "through",
+ "to",
+ "too",
+ "under",
+ "up",
+ "very",
+ "was",
+ "way",
+ "we",
+ "well",
+ "were",
+ "what",
+ "where",
+ "which",
+ "while",
+ "with",
+ "would",
+ "you",
+ "your"
+};
+
+/*
+ * qsort()-style comparison callback for two int values.
+ *
+ * rec1, rec2: pointers to the ints being compared.
+ * Returns 1 if *rec1 is greater than *rec2, -1 if it is smaller,
+ * and 0 if the two values are equal.
+ */
+int intcmp(const void *rec1, const void *rec2) {
+ const int lhs = *(const int *)rec1;
+ const int rhs = *(const int *)rec2;
+
+ /* each comparison yields 0 or 1, so the difference is -1, 0 or 1 */
+ return (lhs > rhs) - (lhs < rhs);
+}
+
void wordbreaker(char *text, int *num_tokens, int **tokens) {
ptr = text;
word_start = NULL;
- while (ptr++, *ptr) {
+ while (*ptr) {
ch = *ptr;
if (isalnum(ch)) {
if (!word_start) {
word_start = ptr;
}
}
- else {
- if (word_start) {
- word_end = ptr;
- --word_end;
-
- /* extract the word */
- word_len = word_end - word_start + 1;
- safestrncpy(word, word_start, sizeof word);
+ ++ptr;
+ ch = *ptr;
+ if ( (!isalnum(ch)) && (word_start) ) {
+ word_end = ptr;
+ --word_end;
+
+ /* extract the word */
+ word_len = word_end - word_start + 1;
+ safestrncpy(word, word_start, sizeof word);
+ if (word_len >= sizeof word) {
+ lprintf(CTDL_DEBUG, "Invalid word length: %d\n", word_len);
+ word[(sizeof word_len) - 1] = 0;
+ }
+ else {
word[word_len] = 0;
- word_start = NULL;
-
- /* are we ok with the length? */
- if ( (word_len >= WB_MIN)
- && (word_len <= WB_MAX) ) {
- for (i=0; i<word_len; ++i) {
- word[i] = tolower(word[i]);
- }
- word_crc = (int)
- CalcCRC16Bytes(word_len, word);
-
- ++wb_num_tokens;
- if (wb_num_tokens > wb_num_alloc) {
- wb_num_alloc += 512;
- wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
- }
- wb_tokens[wb_num_tokens - 1] = word_crc;
+ }
+ word_start = NULL;
+
+ /* disqualify noise words */
+ for (i=0; i<(sizeof(noise_words)/sizeof(char *)); ++i) {
+ if (!strcasecmp(word, noise_words[i])) {
+ word_len = 0;
+ break;
+ }
+ }
+
+ /* are we ok with the length? */
+ if ( (word_len >= WB_MIN)
+ && (word_len <= WB_MAX) ) {
+ for (i=0; i<word_len; ++i) {
+ word[i] = tolower(word[i]);
}
+ word_crc = (int)
+ CalcCRC16Bytes(word_len, word);
+
+ ++wb_num_tokens;
+ if (wb_num_tokens > wb_num_alloc) {
+ wb_num_alloc += 512;
+ wb_tokens = realloc(wb_tokens, (sizeof(int) * wb_num_alloc));
+ }
+ wb_tokens[wb_num_tokens - 1] = word_crc;
+ }
+ }
+ }
+
+ /* sort and purge dups */
+ if (wb_num_tokens > 1) {
+ qsort(wb_tokens, wb_num_tokens, sizeof(int), intcmp);
+ for (i=0; i<(wb_num_tokens-1); ++i) {
+ if (wb_tokens[i] == wb_tokens[i+1]) {
+ memmove(&wb_tokens[i], &wb_tokens[i+1],
+ ((wb_num_tokens - i - 1)*sizeof(int)));
+ --wb_num_tokens;
+ --i;
}
}
}