Commit dccd144

Add support for backslash escaping of separators in documents (#329)
1 parent f4e0e3f commit dccd144

5 files changed (+69, -10 lines)

docs/Escaping.md

Lines changed: 19 additions & 0 deletions

@@ -0,0 +1,19 @@
+# Controlling Text Tokenization and Escaping
+
+At the moment, RediSearch uses a very simple tokenizer for documents and a slightly more sophisticated tokenizer for queries. Both allow a degree of control over string escaping and tokenization.
+
+Note: Text fields and tag fields are tokenized by different mechanisms; this document refers only to text fields. For tag fields, please refer to the [Tag Fields](/Tags) documentation.
+
+## The Rules of Text Field Tokenization
+
+1. All punctuation marks and whitespace (besides underscores) separate documents and queries into tokens. For example, any character of `,.<>{}[]"':;!@#$%^&*()-+=~` will break the text into terms, so the text `foo-bar.baz...bag` will be tokenized into `[foo, bar, baz, bag]`.
+
+2. Escaping separators in both queries and documents is done by prepending a backslash to the separator. For example, the text `hello\-world hello-world` will be tokenized as `[hello-world, hello, world]`. **NOTE** that in most languages an extra backslash is needed to denote an actual backslash when formatting the document or query, so in redis-cli, for example, the text is entered as `hello\\-world`.
+
+3. Underscores (`_`) are not used as separators in either documents or queries, so the text `hello_world` will remain as is after tokenization.
+
+4. Repeating spaces or punctuation marks are stripped.
+
+5. Latin characters are converted to lowercase.
+
+

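The unit test added in this commit (src/tests/test_tokenize.c, further down) exercises exactly this behaviour. As a usage illustration, here is a minimal sketch built on the same internal API calls as that test; the include paths, the `RMUTil_InitAlloc()` setup, and the build against the RediSearch sources are assumptions, and the program only prints what the simple tokenizer produces for an escaped versus an unescaped dash.

```c
// Minimal usage sketch (not part of this commit). It reuses the internal
// tokenizer API exactly as the commit's unit test does; include paths and
// build setup are assumptions.
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "tokenize.h"
#include "stemmer.h"
#include "rmutil/alloc.h"

int main(void) {
  RMUTil_InitAlloc();  // route internal allocations to the standard allocator, as the test does

  Stemmer *st = NewStemmer(SnowballStemmer, "english");
  RSTokenizer *tk = GetSimpleTokenizer(st, DefaultStopWordList());

  // The first dash is escaped and kept inside the token;
  // the second dash acts as a separator.
  char *txt = strdup("hello\\-world hello-world");
  tk->Start(tk, txt, strlen(txt), TOKENIZE_DEFAULT_OPTIONS);

  Token tok;
  while (tk->Next(tk, &tok)) {
    printf("%.*s\n", (int)tok.tokLen, tok.tok);  // hello-world, hello, world
  }
  free(txt);
  return 0;
}
```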
mkdocs.yml

Lines changed: 1 addition & 1 deletion

@@ -34,7 +34,7 @@ pages:
 - 'Query Syntax': 'Query_Syntax.md'
 - 'Stop-Words': 'Stopwords.md'
 - 'Aggregations (NEW!)': 'Aggregations.md'
-
+- 'Tokenization and Escaping': 'Escaping.md'
 - 'Sortable Values': 'Sorting.md'
 - 'Tag Fields': 'Tags.md'
 - 'Highlighting Results': Highlight.md

src/tests/test_tokenize.c

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
+#include "test_util.h"
+#include "../tokenize.h"
+#include "../stemmer.h"
+#include "../rmutil/alloc.h"
+
+int testTokenize() {
+  Stemmer *st = NewStemmer(SnowballStemmer, "english");
+
+  RSTokenizer *tk = GetSimpleTokenizer(st, DefaultStopWordList());
+  char *txt = strdup("hello worlds - - -,,, . . . -=- hello\\-world to be שלום עולם");
+  const char *expected[] = {"hello", "worlds", "hello-world", "שלום", "עולם"};
+  const char *stems[] = {NULL, "+world", NULL, NULL, NULL, NULL};
+  tk->Start(tk, txt, strlen(txt), TOKENIZE_DEFAULT_OPTIONS);
+  Token tok;
+  int i = 0;
+  while (tk->Next(tk, &tok)) {
+    ;
+
+    ASSERT(tok.tokLen == strlen(expected[i]));
+    ASSERT(!strncmp(tok.tok, expected[i], tok.tokLen));
+    if (!stems[i]) {
+      ASSERT(tok.stem == NULL);
+    } else {
+      ASSERT(!strncmp(tok.stem, stems[i], tok.stemLen));
+    }
+    i++;
+  }
+  free(txt);
+
+  RETURN_TEST_SUCCESS;
+}
+
+TEST_MAIN({
+  RMUTil_InitAlloc();
+  TESTFUNC(testTokenize);
+})

src/tokenize.c

Lines changed: 8 additions & 3 deletions

@@ -46,16 +46,22 @@ static char *DefaultNormalize(char *s, char *dst, size_t *len) {
     realDest = dst;          \
     memcpy(realDest, s, ii); \
   }
-
+  // set to 1 if the previous character was a backslash escape
+  int escaped = 0;
   for (size_t ii = 0; ii < origLen; ++ii) {
     if (isupper(s[ii])) {
       SWITCH_DEST();
       realDest[dstLen++] = tolower(s[ii]);
-    } else if (isblank(s[ii]) || iscntrl(s[ii])) {
+    } else if ((isblank(s[ii]) && !escaped) || iscntrl(s[ii])) {
+      SWITCH_DEST();
+    } else if (s[ii] == '\\' && !escaped) {
       SWITCH_DEST();
+      escaped = 1;
+      continue;
     } else {
       dst[dstLen++] = s[ii];
     }
+    escaped = 0;
   }

   *len = dstLen;

@@ -113,7 +119,6 @@ uint32_t simpleTokenizer_Next(RSTokenizer *base, Token *t) {
       t->stemLen = sl;
     }
   }
-
   return ctx->lastOffset;
 }

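Read on its own, the DefaultNormalize() change works like this: a backslash that is not itself escaped is consumed rather than copied to the output, and the escape suppresses blank-stripping for the character that follows it. The sketch below is a simplified, self-contained re-implementation of just that idea; the function name and signature are hypothetical, and the SWITCH_DEST() copy-on-write machinery of the real function is omitted.

```c
// Simplified sketch of the escape-aware normalization idea: lowercase the
// input, drop unescaped blanks and control characters, and let a backslash
// escape the character that follows it. Not the commit's actual code.
#include <ctype.h>
#include <stdio.h>

static size_t normalize_sketch(const char *s, char *dst) {
  size_t len = 0;
  int escaped = 0;  // 1 if the previous character was a backslash escape
  for (size_t i = 0; s[i]; ++i) {
    unsigned char c = (unsigned char)s[i];
    if (c == '\\' && !escaped) {
      escaped = 1;  // swallow the backslash, escape the next character
      continue;
    }
    if ((isblank(c) && !escaped) || iscntrl(c)) {
      escaped = 0;  // unescaped blank or control character: stripped
      continue;
    }
    dst[len++] = (char)tolower(c);
    escaped = 0;
  }
  dst[len] = '\0';
  return len;
}

int main(void) {
  char out[64];
  normalize_sketch("Hello\\-World", out);
  printf("%s\n", out);  // hello-world   (backslash consumed, dash kept)
  normalize_sketch("hello\\ world", out);
  printf("%s\n", out);  // hello world   (escaped space survives stripping)
  return 0;
}
```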
src/toksep.h

Lines changed: 5 additions & 6 deletions

@@ -5,11 +5,10 @@
 #include <stdlib.h>
 //! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ ` { | } ~
 static const char ToksepMap_g[256] = {
-    [' '] = 1, ['\t'] = 1, [','] = 1, ['.'] = 1, ['/'] = 1, ['('] = 1, [')'] = 1,
-    ['{'] = 1, ['}'] = 1, ['['] = 1, [']'] = 1, [':'] = 1, [';'] = 1, ['\\'] = 1,
-    ['~'] = 1, ['!'] = 1, ['@'] = 1, ['#'] = 1, ['$'] = 1, ['%'] = 1, ['^'] = 1,
-    ['&'] = 1, ['*'] = 1, ['-'] = 1, ['='] = 1, ['+'] = 1, ['|'] = 1, ['\''] = 1,
-    ['`'] = 1, ['"'] = 1, ['<'] = 1, ['>'] = 1, ['?'] = 1,
+    [' '] = 1, ['\t'] = 1, [','] = 1, ['.'] = 1, ['/'] = 1, ['('] = 1, [')'] = 1, ['{'] = 1,
+    ['}'] = 1, ['['] = 1, [']'] = 1, [':'] = 1, [';'] = 1, ['~'] = 1, ['!'] = 1, ['@'] = 1,
+    ['#'] = 1, ['$'] = 1, ['%'] = 1, ['^'] = 1, ['&'] = 1, ['*'] = 1, ['-'] = 1, ['='] = 1,
+    ['+'] = 1, ['|'] = 1, ['\''] = 1, ['`'] = 1, ['"'] = 1, ['<'] = 1, ['>'] = 1, ['?'] = 1,
 };

 /**
@@ -20,7 +19,7 @@ static inline char *toksep(char **s, size_t *tokLen) {
   uint8_t *pos = (uint8_t *)*s;
   char *orig = *s;
   for (; *pos; ++pos) {
-    if (ToksepMap_g[*pos]) {
+    if (ToksepMap_g[*pos] && ((char *)pos == orig || *(pos - 1) != '\\')) {
       *s = (char *)++pos;
       *tokLen = ((char *)pos - orig) - 1;
       if (!*pos) {

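The two edits above work together: backslash is dropped from ToksepMap_g so it is no longer a separator itself, and toksep() now refuses to split on a separator whose preceding byte is a backslash. Below is a self-contained sketch of that check; is_sep() and first_token_len() are hypothetical stand-ins, not functions from this header.

```c
// Self-contained sketch of the "separator unless preceded by a backslash"
// check from toksep(); is_sep() stands in for the ToksepMap_g lookup.
#include <stdio.h>
#include <string.h>

static int is_sep(char c) {
  // Same separator set as the new map: note that backslash is not included.
  return strchr(" \t,./(){}[]:;~!@#$%^&*-=+|'`\"<>?", c) != NULL;
}

// Returns the length of the first token in s, honouring backslash escapes.
static size_t first_token_len(const char *s) {
  for (size_t i = 0; s[i]; ++i) {
    if (is_sep(s[i]) && (i == 0 || s[i - 1] != '\\')) {
      return i;  // split here: the separator is not escaped
    }
  }
  return strlen(s);
}

int main(void) {
  printf("%zu\n", first_token_len("hello-world"));    // 5: "hello" ends at the unescaped dash
  printf("%zu\n", first_token_len("hello\\-world"));  // 12: the escaped dash does not split the token
  return 0;
}
```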