Skip to content

Commit ac52836

Browse files
authored
Merge pull request #260 from gregjacobs/linear-time-email-matcher
Linear time email matcher
2 parents fe79604 + 9942278 commit ac52836

File tree

7 files changed

+430
-66
lines changed

7 files changed

+430
-66
lines changed

gulpfile.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ function buildSrcMinifyUmdTask() {
293293
async function buildSrcCheckMinifiedSizeTask() {
294294
const stats = await fs.stat( './dist/Autolinker.min.js' );
295295
const sizeInKb = stats.size / 1000;
296-
const maxExpectedSizeInKb = 44;
296+
const maxExpectedSizeInKb = 46;
297297

298298
if( sizeInKb > maxExpectedSizeInKb ) {
299299
throw new Error( `

src/htmlParser/parse-html.ts

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import { State } from './state';
2+
import { letterRe, digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
3+
import { throwUnhandledCaseError } from '../utils';
24

35
// For debugging: search for other "For debugging" lines
46
// import CliTable from 'cli-table';
@@ -61,12 +63,7 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
6163
onComment: ( offset: number ) => void;
6264
onDoctype: ( offset: number ) => void;
6365
} ) {
64-
const letterRe = /[A-Za-z]/,
65-
digitRe = /[0-9]/,
66-
whitespaceRe = /\s/,
67-
quoteRe = /['"]/,
68-
controlCharsRe = /[\x00-\x1F\x7F]/, // control chars (0-31), and the delete (DEL) char (127)
69-
noCurrentTag = new CurrentTag();
66+
const noCurrentTag = new CurrentTag();
7067

7168
let charIdx = 0,
7269
len = html.length,
@@ -112,7 +109,7 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
112109
case State.Doctype: stateDoctype( char ); break;
113110

114111
default:
115-
throwUnhandledStateError( state );
112+
throwUnhandledCaseError( state );
116113
}
117114

118115
// For debugging: search for other "For debugging" lines
@@ -131,14 +128,6 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
131128
//console.log( '\n' + table.toString() );
132129

133130

134-
/**
135-
* Function that should never be called but is used to check that every
136-
* enum value is handled using TypeScript's 'never' type.
137-
*/
138-
function throwUnhandledStateError( state: never ) {
139-
throw new Error( 'Unhandled State' )
140-
}
141-
142131

143132
// Called when non-tags are being read (i.e. the text around HTML tags)
144133
// https://www.w3.org/TR/html51/syntax.html#data-state

src/matcher/email-matcher.ts

Lines changed: 232 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import { Matcher } from "./matcher";
2-
import { alphaNumericAndMarksCharsStr, getDomainNameStr } from "../regex-lib";
3-
import { tldRegex } from "./tld-regex";
2+
import { alphaNumericAndMarksCharsStr, domainNameCharRegex } from "../regex-lib";
43
import { EmailMatch } from "../match/email-match";
54
import { Match } from "../match/match";
5+
import { throwUnhandledCaseError } from '../utils';
6+
7+
// For debugging: search for other "For debugging" lines
8+
// import CliTable from 'cli-table';
69

710
/**
811
* @class Autolinker.matcher.Email
@@ -15,49 +18,245 @@ import { Match } from "../match/match";
1518
export class EmailMatcher extends Matcher {
1619

1720
/**
18-
* The regular expression to match email addresses. Example match:
19-
*
20-
21-
*
22-
* @protected
23-
* @property {RegExp} matcherRegex
21+
* Valid characters that can be used in the "local" part of an email address,
22+
* i.e. the "name" part of "name@site.com"
2423
*/
25-
protected matcherRegex = (function() {
26-
var specialCharacters = '!#$%&\'*+\\-\\/=?^_`{|}~',
27-
restrictedSpecialCharacters = '\\s"(),:;<>@\\[\\]',
28-
validCharacters = alphaNumericAndMarksCharsStr + specialCharacters,
29-
validRestrictedCharacters = validCharacters + restrictedSpecialCharacters,
30-
emailRegex = new RegExp( '(?:[' + validCharacters + '](?:[' + validCharacters + ']|\\.(?!\\.|@))*|\\"[' + validRestrictedCharacters + '.]+\\")@');
31-
32-
return new RegExp( [
33-
emailRegex.source,
34-
getDomainNameStr( 1 ),
35-
'\\.', tldRegex.source // '.com', '.net', etc
36-
].join( "" ), 'gi' );
37-
} )();
24+
protected localPartCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}!#$%&'*+/=?^_\`{|}~-]` );
3825

3926

4027
/**
4128
* @inheritdoc
4229
*/
4330
parseMatches( text: string ) {
44-
let matcherRegex = this.matcherRegex,
45-
tagBuilder = this.tagBuilder,
46-
matches: Match[] = [],
47-
match: RegExpExecArray | null;
31+
const tagBuilder = this.tagBuilder,
32+
localPartCharRegex = this.localPartCharRegex,
33+
matches: Match[] = [],
34+
len = text.length,
35+
noCurrentEmailAddress = new CurrentEmailAddress();
36+
37+
let charIdx = 0,
38+
state = State.NonEmailAddress as State,
39+
currentEmailAddress = noCurrentEmailAddress;
40+
41+
// For debugging: search for other "For debugging" lines
42+
// const table = new CliTable( {
43+
// head: [ 'charIdx', 'char', 'state', 'charIdx', 'currentEmailAddress.idx', 'hasDomainDot' ]
44+
// } );
4845

49-
while( ( match = matcherRegex.exec( text ) ) !== null ) {
50-
let matchedText = match[ 0 ];
46+
while( charIdx < len ) {
47+
const char = text.charAt( charIdx );
5148

52-
matches.push( new EmailMatch( {
53-
tagBuilder : tagBuilder,
54-
matchedText : matchedText,
55-
offset : match.index,
56-
email : matchedText
57-
} ) );
49+
// For debugging: search for other "For debugging" lines
50+
// table.push(
51+
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
52+
// );
53+
54+
switch( state ) {
55+
case State.NonEmailAddress: stateNonEmailAddress( char ); break;
56+
case State.LocalPart: stateLocalPart( char ); break;
57+
case State.LocalPartDot: stateLocalPartDot( char ); break;
58+
case State.AtSign: stateAtSign( char ); break;
59+
case State.DomainChar: stateDomainChar( char ); break;
60+
case State.DomainHyphen: stateDomainHyphen( char ); break;
61+
case State.DomainDot: stateDomainDot( char ); break;
62+
63+
default:
64+
throwUnhandledCaseError( state );
65+
}
66+
67+
// For debugging: search for other "For debugging" lines
68+
// table.push(
69+
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
70+
// );
71+
72+
charIdx++;
5873
}
5974

75+
// Capture any valid match at the end of the string
76+
captureMatchIfValidAndReset();
77+
78+
// For debugging: search for other "For debugging" lines
79+
//console.log( '\n' + table.toString() );
80+
6081
return matches;
82+
83+
84+
// Handles the state when we're not in an email address
85+
function stateNonEmailAddress( char: string ) {
86+
if( localPartCharRegex.test( char ) ) {
87+
beginEmailAddress();
88+
89+
} else {
90+
// not an email address character, continue
91+
}
92+
}
93+
94+
95+
// Handles the state when we're currently in the "local part" of an
96+
// email address (as opposed to the "domain part")
97+
function stateLocalPart( char: string ) {
98+
if( char === '.' ) {
99+
state = State.LocalPartDot;
100+
101+
} else if( char === '@' ) {
102+
state = State.AtSign;
103+
104+
} else if( localPartCharRegex.test( char ) ) {
105+
// stay in the "local part" of the email address
106+
107+
} else {
108+
// not an email address character, return to "NonEmailAddress" state
109+
resetToNonEmailAddressState();
110+
}
111+
}
112+
113+
114+
// Handles the state where we've read a '.' character in the local part
115+
function stateLocalPartDot( char: string ) {
116+
if( char === '.' ) {
117+
// We read a second '.' in a row, not a valid email address
118+
// local part
119+
resetToNonEmailAddressState();
120+
121+
} else if( char === '@' ) {
122+
// We read the '@' character immediately after a dot ('.'), not
123+
// an email address
124+
resetToNonEmailAddressState();
125+
126+
} else if( localPartCharRegex.test( char ) ) {
127+
state = State.LocalPart;
128+
129+
} else {
130+
// Anything else, not an email address
131+
resetToNonEmailAddressState();
132+
}
133+
}
134+
135+
136+
function stateAtSign( char: string ) {
137+
if( domainNameCharRegex.test( char ) ) {
138+
state = State.DomainChar;
139+
140+
} else {
141+
// Anything else, not an email address
142+
resetToNonEmailAddressState();
143+
}
144+
}
145+
146+
function stateDomainChar( char: string ) {
147+
if( char === '.' ) {
148+
state = State.DomainDot;
149+
150+
} else if( char === '-' ) {
151+
state = State.DomainHyphen;
152+
153+
} else if( domainNameCharRegex.test( char ) ) {
154+
// Stay in the DomainChar state
155+
156+
} else {
157+
// Anything else, we potentially matched if the criteria has
158+
// been met
159+
captureMatchIfValidAndReset();
160+
}
161+
}
162+
163+
function stateDomainHyphen( char: string ) {
164+
if( char === '-' || char === '.' ) {
165+
// Not valid to have two hyphens ("--") or hyphen+dot ("-.")
166+
captureMatchIfValidAndReset();
167+
168+
} else if( domainNameCharRegex.test( char ) ) {
169+
state = State.DomainChar;
170+
171+
} else {
172+
// Anything else
173+
captureMatchIfValidAndReset();
174+
}
175+
}
176+
177+
function stateDomainDot( char: string ) {
178+
if( char === '.' || char === '-' ) {
179+
// not valid to have two dots ("..") or dot+hyphen (".-")
180+
captureMatchIfValidAndReset();
181+
182+
} else if( domainNameCharRegex.test( char ) ) {
183+
state = State.DomainChar;
184+
185+
// After having read a '.' and then a valid domain character,
186+
// we now know that the domain part of the email is valid, and
187+
// we have found at least a partial EmailMatch (however, the
188+
// email address may have additional characters from this point)
189+
currentEmailAddress = new CurrentEmailAddress( {
190+
...currentEmailAddress,
191+
hasDomainDot: true
192+
} );
193+
194+
} else {
195+
// Anything else
196+
captureMatchIfValidAndReset();
197+
}
198+
}
199+
200+
201+
function beginEmailAddress() {
202+
state = State.LocalPart;
203+
currentEmailAddress = new CurrentEmailAddress( { idx: charIdx } );
204+
}
205+
206+
function resetToNonEmailAddressState() {
207+
state = State.NonEmailAddress;
208+
currentEmailAddress = noCurrentEmailAddress
209+
}
210+
211+
212+
/*
213+
* Captures the current email address as an EmailMatch if it's valid,
214+
* and resets the state to read another email address.
215+
*/
216+
function captureMatchIfValidAndReset() {
217+
if( currentEmailAddress.hasDomainDot ) { // we need at least one dot in the domain to be considered a valid email address
218+
let emailAddress = text.slice( currentEmailAddress.idx, charIdx );
219+
220+
// If we read a '.' or '-' char that ended the email address
221+
// (valid domain name characters, but only valid email address
222+
// characters if they are followed by something else), strip
223+
// it off now
224+
if( /[-.]$/.test( emailAddress ) ){
225+
emailAddress = emailAddress.slice( 0, -1 );
226+
}
227+
228+
matches.push( new EmailMatch( {
229+
tagBuilder : tagBuilder,
230+
matchedText : emailAddress,
231+
offset : currentEmailAddress.idx,
232+
email : emailAddress
233+
} ) );
234+
}
235+
236+
resetToNonEmailAddressState();
237+
}
61238
}
62239

63240
}
241+
242+
243+
const enum State {
244+
NonEmailAddress = 0,
245+
LocalPart,
246+
LocalPartDot,
247+
AtSign,
248+
DomainChar,
249+
DomainHyphen,
250+
DomainDot
251+
}
252+
253+
254+
class CurrentEmailAddress {
255+
readonly idx: number; // the index of the first character in the email address
256+
readonly hasDomainDot: boolean;
257+
258+
constructor( cfg: Partial<CurrentEmailAddress> = {} ) {
259+
this.idx = cfg.idx !== undefined ? cfg.idx : -1;
260+
this.hasDomainDot = !!cfg.hasDomainDot;
261+
}
262+
}

src/regex-lib.ts

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,32 @@
66
* regular expressions that are shared between source files.
77
*/
88

9+
/**
10+
* Regular expression to match upper and lowercase ASCII letters
11+
*/
12+
export const letterRe = /[A-Za-z]/;
13+
14+
/**
15+
* Regular expression to match ASCII digits
16+
*/
17+
export const digitRe = /[0-9]/;
18+
19+
/**
20+
* Regular expression to match whitespace
21+
*/
22+
export const whitespaceRe = /\s/;
23+
24+
/**
25+
* Regular expression to match quote characters
26+
*/
27+
export const quoteRe = /['"]/;
28+
29+
/**
30+
* Regular expression to match the range of ASCII control characters (0-31), and
31+
* the delete (DEL) char (127)
32+
*/
33+
export const controlCharsRe = /[\x00-\x1F\x7F]/;
34+
935
/**
1036
* The string form of a regular expression that would match all of the
1137
* alphabetic ("letter") chars in the unicode character set when placed in a
@@ -142,3 +168,10 @@ export const getDomainNameStr = ( group: number ) => {
142168
* Ex: 'google', 'yahoo', 'some-other-company', etc.
143169
*/
144170
export const domainNameRegex = new RegExp( '[' + alphaNumericAndMarksCharsStr + '.\\-]*[' + alphaNumericAndMarksCharsStr + '\\-]' );
171+
172+
173+
/**
174+
* A regular expression that is simply the character class of the characters
175+
* that may be used in a domain name, minus the '-' or '.'
176+
*/
177+
export const domainNameCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}]` );

0 commit comments

Comments
 (0)