Commit dccd144

Add support for backslash escaping of separators in documents (#329)
1 parent f4e0e3f commit dccd144

5 files changed (+69, -10 lines)

docs/Escaping.md

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+# Controlling Text Tokenization and Escaping
+
+At the moment, RediSearch uses a very simple tokenizer for documents and a slightly more sophisticated tokenizer for queries. Both allow a degree of control over string escaping and tokenization.
+
+Note: Text fields and tag fields are tokenized by different mechanisms; this document refers only to text fields. For tag fields, please refer to the [Tag Fields](/Tags) documentation.
+
+## The Rules of Text Field Tokenization
+
+1. All punctuation marks and whitespace (besides underscores) separate documents and queries into tokens. For example, any character of `,.<>{}[]"':;!@#$%^&*()-+=~` will break the text into terms, so the text `foo-bar.baz...bag` will be tokenized into `[foo, bar, baz, bag]`.
+
+2. Escaping separators in both queries and documents is done by prepending a backslash to the separator. For example, the text `hello\-world hello-world` will be tokenized as `[hello-world, hello, world]`. **NOTE** that in most languages an extra backslash is needed to denote an actual backslash when formatting the document or query, so in redis-cli, for example, the text is entered as `hello\\-world`.
+
+3. Underscores (`_`) are not used as separators in either documents or queries, so the text `hello_world` will remain as is after tokenization.
+
+4. Repeating spaces or punctuation marks are stripped.
+
+5. Latin characters are converted to lowercase.
+
+

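The unit test added in this commit (src/tests/test_tokenize.c, further down) exercises exactly this behaviour. As a usage illustration, here is a minimal sketch built on the same internal API calls as that test; the include paths, the `RMUTil_InitAlloc()` setup, and the build against the RediSearch sources are assumptions, and the program only prints what the simple tokenizer produces for an escaped versus an unescaped dash.

```c
// Minimal usage sketch (not part of this commit). It reuses the internal
// tokenizer API exactly as the commit's unit test does; include paths and
// build setup are assumptions.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "tokenize.h"
#include "stemmer.h"
#include "rmutil/alloc.h"

int main(void) {
  RMUTil_InitAlloc();  // route internal allocations to the standard allocator, as the test does

  Stemmer *st = NewStemmer(SnowballStemmer, "english");
  RSTokenizer *tk = GetSimpleTokenizer(st, DefaultStopWordList());

  // The first dash is escaped and kept inside the token;
  // the second dash acts as a separator.
  char *txt = strdup("hello\\-world hello-world");
  tk->Start(tk, txt, strlen(txt), TOKENIZE_DEFAULT_OPTIONS);

  Token tok;
  while (tk->Next(tk, &tok)) {
    printf("%.*s\n", (int)tok.tokLen, tok.tok);  // hello-world, hello, world
  }
  free(txt);
  return 0;
}
```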
mkdocs.yml

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ pages:
 - 'Query Syntax': 'Query_Syntax.md'
 - 'Stop-Words': 'Stopwords.md'
 - 'Aggregations (NEW!)': 'Aggregations.md'
-
+- 'Tokenization and Escaping': 'Escaping.md'
 - 'Sortable Values': 'Sorting.md'
 - 'Tag Fields': 'Tags.md'
 - 'Highlighting Results': Highlight.md

src/tests/test_tokenize.c

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+#include "test_util.h"
+#include "../tokenize.h"
+#include "../stemmer.h"
+#include "../rmutil/alloc.h"
+
+int testTokenize() {
+  Stemmer *st = NewStemmer(SnowballStemmer, "english");
+
+  RSTokenizer *tk = GetSimpleTokenizer(st, DefaultStopWordList());
+  char *txt = strdup("hello worlds - - -,,, . . . -=- hello\\-world to be שלום עולם");
+  const char *expected[] = {"hello", "worlds", "hello-world", "שלום", "עולם"};
+  const char *stems[] = {NULL, "+world", NULL, NULL, NULL, NULL};
+  tk->Start(tk, txt, strlen(txt), TOKENIZE_DEFAULT_OPTIONS);
+  Token tok;
+  int i = 0;
+  while (tk->Next(tk, &tok)) {
+    ;
+
+    ASSERT(tok.tokLen == strlen(expected[i]));
+    ASSERT(!strncmp(tok.tok, expected[i], tok.tokLen));
+    if (!stems[i]) {
+      ASSERT(tok.stem == NULL);
+    } else {
+      ASSERT(!strncmp(tok.stem, stems[i], tok.stemLen));
+    }
+    i++;
+  }
+  free(txt);
+
+  RETURN_TEST_SUCCESS;
+}
+
+TEST_MAIN({
+  RMUTil_InitAlloc();
+  TESTFUNC(testTokenize);
+})

src/tokenize.c

Lines changed: 8 additions & 3 deletions

@@ -46,16 +46,22 @@ static char *DefaultNormalize(char *s, char *dst, size_t *len) {
     realDest = dst;          \
     memcpy(realDest, s, ii); \
   }
-
+  // set to 1 if the previous character was a backslash escape
+  int escaped = 0;
   for (size_t ii = 0; ii < origLen; ++ii) {
     if (isupper(s[ii])) {
       SWITCH_DEST();
       realDest[dstLen++] = tolower(s[ii]);
-    } else if (isblank(s[ii]) || iscntrl(s[ii])) {
+    } else if ((isblank(s[ii]) && !escaped) || iscntrl(s[ii])) {
+      SWITCH_DEST();
+    } else if (s[ii] == '\\' && !escaped) {
       SWITCH_DEST();
+      escaped = 1;
+      continue;
     } else {
       dst[dstLen++] = s[ii];
     }
+    escaped = 0;
   }

   *len = dstLen;

@@ -113,7 +119,6 @@ uint32_t simpleTokenizer_Next(RSTokenizer *base, Token *t) {
       t->stemLen = sl;
     }
   }
-
   return ctx->lastOffset;
 }

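Read on its own, the DefaultNormalize() change works like this: a backslash that is not itself escaped is consumed rather than copied to the output, and the escape suppresses blank-stripping for the character that follows it. The sketch below is a simplified, self-contained re-implementation of just that idea; the function name and signature are hypothetical, and the SWITCH_DEST() copy-on-write machinery of the real function is omitted.

```c
// Simplified sketch of the escape-aware normalization idea: lowercase the
// input, drop unescaped blanks and control characters, and let a backslash
// escape the character that follows it. Not the commit's actual code.
#include <ctype.h>
#include <stdio.h>

static size_t normalize_sketch(const char *s, char *dst) {
  size_t len = 0;
  int escaped = 0;  // 1 if the previous character was a backslash escape
  for (size_t i = 0; s[i]; ++i) {
    unsigned char c = (unsigned char)s[i];
    if (c == '\\' && !escaped) {
      escaped = 1;  // swallow the backslash, escape the next character
      continue;
    }
    if ((isblank(c) && !escaped) || iscntrl(c)) {
      escaped = 0;  // unescaped blank or control character: stripped
      continue;
    }
    dst[len++] = (char)tolower(c);
    escaped = 0;
  }
  dst[len] = '\0';
  return len;
}

int main(void) {
  char out[64];
  normalize_sketch("Hello\\-World", out);
  printf("%s\n", out);  // hello-world   (backslash consumed, dash kept)
  normalize_sketch("hello\\ world", out);
  printf("%s\n", out);  // hello world   (escaped space survives stripping)
  return 0;
}
```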
src/toksep.h

Lines changed: 5 additions & 6 deletions

@@ -5,11 +5,10 @@
 #include <stdlib.h>
 //! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ ` { | } ~
 static const char ToksepMap_g[256] = {
-    [' '] = 1, ['\t'] = 1, [','] = 1, ['.'] = 1, ['/'] = 1, ['('] = 1, [')'] = 1,
-    ['{'] = 1, ['}'] = 1, ['['] = 1, [']'] = 1, [':'] = 1, [';'] = 1, ['\\'] = 1,
-    ['~'] = 1, ['!'] = 1, ['@'] = 1, ['#'] = 1, ['$'] = 1, ['%'] = 1, ['^'] = 1,
-    ['&'] = 1, ['*'] = 1, ['-'] = 1, ['='] = 1, ['+'] = 1, ['|'] = 1, ['\''] = 1,
-    ['`'] = 1, ['"'] = 1, ['<'] = 1, ['>'] = 1, ['?'] = 1,
+    [' '] = 1, ['\t'] = 1, [','] = 1, ['.'] = 1, ['/'] = 1, ['('] = 1, [')'] = 1, ['{'] = 1,
+    ['}'] = 1, ['['] = 1, [']'] = 1, [':'] = 1, [';'] = 1, ['~'] = 1, ['!'] = 1, ['@'] = 1,
+    ['#'] = 1, ['$'] = 1, ['%'] = 1, ['^'] = 1, ['&'] = 1, ['*'] = 1, ['-'] = 1, ['='] = 1,
+    ['+'] = 1, ['|'] = 1, ['\''] = 1, ['`'] = 1, ['"'] = 1, ['<'] = 1, ['>'] = 1, ['?'] = 1,
 };

 /**
@@ -20,7 +19,7 @@ static inline char *toksep(char **s, size_t *tokLen) {
   uint8_t *pos = (uint8_t *)*s;
   char *orig = *s;
   for (; *pos; ++pos) {
-    if (ToksepMap_g[*pos]) {
+    if (ToksepMap_g[*pos] && ((char *)pos == orig || *(pos - 1) != '\\')) {
       *s = (char *)++pos;
       *tokLen = ((char *)pos - orig) - 1;
       if (!*pos) {

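The two edits above work together: backslash is dropped from ToksepMap_g so it is no longer a separator itself, and toksep() now refuses to split on a separator whose preceding byte is a backslash. Below is a self-contained sketch of that check; is_sep() and first_token_len() are hypothetical stand-ins, not functions from this header.

```c
// Self-contained sketch of the "separator unless preceded by a backslash"
// check from toksep(); is_sep() stands in for the ToksepMap_g lookup.
#include <stdio.h>
#include <string.h>

static int is_sep(char c) {
  // Same separator set as the new map: note that backslash is not included.
  return strchr(" \t,./(){}[]:;~!@#$%^&*-=+|'`\"<>?", c) != NULL;
}

// Returns the length of the first token in s, honouring backslash escapes.
static size_t first_token_len(const char *s) {
  for (size_t i = 0; s[i]; ++i) {
    if (is_sep(s[i]) && (i == 0 || s[i - 1] != '\\')) {
      return i;  // split here: the separator is not escaped
    }
  }
  return strlen(s);
}

int main(void) {
  printf("%zu\n", first_token_len("hello-world"));    // 5: "hello" ends at the unescaped dash
  printf("%zu\n", first_token_len("hello\\-world"));  // 12: the escaped dash does not split the token
  return 0;
}
```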