|
1 | 1 | import Diff from './base.js'; |
2 | | -import type { ChangeObject, CallbackOptionAbortable, CallbackOptionNonabortable, DiffCallbackNonabortable, DiffSentencesOptionsAbortable, DiffSentencesOptionsNonabortable} from '../types.js'; |
| 2 | +import type { |
| 3 | + ChangeObject, |
| 4 | + CallbackOptionAbortable, |
| 5 | + CallbackOptionNonabortable, |
| 6 | + DiffCallbackNonabortable, |
| 7 | + DiffSentencesOptionsAbortable, |
| 8 | + DiffSentencesOptionsNonabortable |
| 9 | +} from '../types.js'; |
| 10 | + |
/**
 * Reports whether `char` is one of the punctuation marks treated as ending a
 * sentence: a full stop, an exclamation mark, or a question mark.
 *
 * @param char - The string to test; callers pass a single character.
 * @returns `true` if `char` is exactly `'.'`, `'!'` or `'?'`, otherwise `false`.
 */
function isSentenceEndPunct(char: string): boolean {
  return char === '.' || char === '!' || char === '?';
}
3 | 14 |
|
class SentenceDiff extends Diff<string, string> {
  /**
   * Splits `value` into sentence tokens and the whitespace runs that separate them.
   *
   * A sentence break is a `.`, `!` or `?` immediately followed by whitespace. The
   * text up to and including the punctuation mark becomes one token, and the whole
   * run of whitespace after it becomes the next token. Whitespace that does not
   * follow one of those punctuation marks stays inside its sentence token.
   *
   * @param value - The text to tokenize.
   * @returns The tokens in order; joining them reproduces `value`. An empty
   *   `value` yields an empty array.
   */
  tokenize(value: string): string[] {
    // If in future we drop support for environments that don't support lookbehinds, we can replace
    // this entire function with:
    //     return value.split(/(?<=[.!?])(\s+|$)/);
    // but until then, for similar reasons to the trailingWs function in string.ts, we are forced
    // to do this verbosely "by hand" instead of using a regex.
    //
    // NOTE(review): unlike that regex split, this loop never emits empty-string tokens (the regex
    // yields one for '' and after trailing whitespace). Presumably harmless if empty tokens are
    // discarded downstream - confirm against the base class before swapping implementations.

    // Built once rather than on every iteration. It has no `g`/`y` flag, so `test` is stateless.
    const whitespace = /\s/;
    const result: string[] = [];
    let tokenStartI = 0;

    for (let i = 0; i < value.length; i++) {
      if (i === value.length - 1) {
        // Last character: whatever remains is the final token.
        result.push(value.slice(tokenStartI));
        break;
      }

      // `charAt` always returns a string ('' when out of range), so no undefined handling is needed.
      if (isSentenceEndPunct(value.charAt(i)) && whitespace.test(value.charAt(i + 1))) {
        // We've hit a sentence break - i.e. a punctuation mark followed by whitespace.
        // We now want to push TWO tokens to the result:
        // 1. the sentence
        result.push(value.slice(tokenStartI, i + 1));

        // 2. the whitespace
        i = tokenStartI = i + 1;
        while (whitespace.test(value.charAt(i + 1))) {
          i++;
        }
        result.push(value.slice(tokenStartI, i + 1));

        // Then the next token (a sentence) starts on the character after the whitespace.
        // (It's okay if this is off the end of the string - then the outer loop will terminate
        // here anyway.)
        tokenStartI = i + 1;
      }
    }

    return result;
  }
}
9 | 53 |
|
|
0 commit comments