Skip to content

Commit 0087799

Browse files
committed
Replace regex-based email matcher with state machine parser
1 parent 44d416f commit 0087799

File tree

7 files changed

+394
-60
lines changed

7 files changed

+394
-60
lines changed

gulpfile.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -293,7 +293,7 @@ function buildSrcMinifyUmdTask() {
293293
async function buildSrcCheckMinifiedSizeTask() {
294294
const stats = await fs.stat( './dist/Autolinker.min.js' );
295295
const sizeInKb = stats.size / 1000;
296-
const maxExpectedSizeInKb = 44;
296+
const maxExpectedSizeInKb = 46;
297297

298298
if( sizeInKb > maxExpectedSizeInKb ) {
299299
throw new Error( `

src/htmlParser/parse-html.ts

Lines changed: 2 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import { State } from './state';
22
import { letterRe, digitRe, whitespaceRe, quoteRe, controlCharsRe } from '../regex-lib';
3+
import { throwUnhandledCaseError } from '../utils';
34

45
// For debugging: search for other "For debugging" lines
56
// import CliTable from 'cli-table';
@@ -108,7 +109,7 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
108109
case State.Doctype: stateDoctype( char ); break;
109110

110111
default:
111-
throwUnhandledStateError( state );
112+
throwUnhandledCaseError( state );
112113
}
113114

114115
// For debugging: search for other "For debugging" lines
@@ -127,14 +128,6 @@ export function parseHtml( html: string, { onOpenTag, onCloseTag, onText, onComm
127128
//console.log( '\n' + table.toString() );
128129

129130

130-
/**
131-
* Function that should never be called but is used to check that every
132-
* enum value is handled using TypeScript's 'never' type.
133-
*/
134-
function throwUnhandledStateError( state: never ) {
135-
throw new Error( 'Unhandled State' )
136-
}
137-
138131

139132
// Called when non-tags are being read (i.e. the text around HTML tags)
140133
// https://www.w3.org/TR/html51/syntax.html#data-state

src/matcher/email-matcher.ts

Lines changed: 232 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
import { Matcher } from "./matcher";
2-
import { alphaNumericAndMarksCharsStr, getDomainNameStr } from "../regex-lib";
3-
import { tldRegex } from "./tld-regex";
2+
import { alphaNumericAndMarksCharsStr, domainNameCharRegex } from "../regex-lib";
43
import { EmailMatch } from "../match/email-match";
54
import { Match } from "../match/match";
5+
import { throwUnhandledCaseError } from '../utils';
6+
7+
// For debugging: search for other "For debugging" lines
8+
// import CliTable from 'cli-table';
69

710
/**
811
* @class Autolinker.matcher.Email
@@ -15,49 +18,245 @@ import { Match } from "../match/match";
1518
export class EmailMatcher extends Matcher {
1619

1720
/**
18-
* The regular expression to match email addresses. Example match:
19-
*
20-
21-
*
22-
* @protected
23-
* @property {RegExp} matcherRegex
21+
* Valid characters that can be used in the "local" part of an email address,
22+
* i.e. the "name" part of "name@example.com"
2423
*/
25-
protected matcherRegex = (function() {
26-
var specialCharacters = '!#$%&\'*+\\-\\/=?^_`{|}~',
27-
restrictedSpecialCharacters = '\\s"(),:;<>@\\[\\]',
28-
validCharacters = alphaNumericAndMarksCharsStr + specialCharacters,
29-
validRestrictedCharacters = validCharacters + restrictedSpecialCharacters,
30-
emailRegex = new RegExp( '(?:[' + validCharacters + '](?:[' + validCharacters + ']|\\.(?!\\.|@))*|\\"[' + validRestrictedCharacters + '.]+\\")@');
31-
32-
return new RegExp( [
33-
emailRegex.source,
34-
getDomainNameStr( 1 ),
35-
'\\.', tldRegex.source // '.com', '.net', etc
36-
].join( "" ), 'gi' );
37-
} )();
24+
protected localPartCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}!#$%&'*+/=?^_\`{|}~-]` );
3825

3926

4027
/**
4128
* @inheritdoc
4229
*/
4330
parseMatches( text: string ) {
44-
let matcherRegex = this.matcherRegex,
45-
tagBuilder = this.tagBuilder,
46-
matches: Match[] = [],
47-
match: RegExpExecArray | null;
31+
const tagBuilder = this.tagBuilder,
32+
localPartCharRegex = this.localPartCharRegex,
33+
matches: Match[] = [],
34+
len = text.length,
35+
noCurrentEmailAddress = new CurrentEmailAddress();
36+
37+
let charIdx = 0,
38+
state = State.NonEmailAddress as State,
39+
currentEmailAddress = noCurrentEmailAddress;
40+
41+
// For debugging: search for other "For debugging" lines
42+
// const table = new CliTable( {
43+
// head: [ 'charIdx', 'char', 'state', 'charIdx', 'currentEmailAddress.idx', 'hasDomainDot' ]
44+
// } );
4845

49-
while( ( match = matcherRegex.exec( text ) ) !== null ) {
50-
let matchedText = match[ 0 ];
46+
while( charIdx < len ) {
47+
const char = text.charAt( charIdx );
5148

52-
matches.push( new EmailMatch( {
53-
tagBuilder : tagBuilder,
54-
matchedText : matchedText,
55-
offset : match.index,
56-
email : matchedText
57-
} ) );
49+
// For debugging: search for other "For debugging" lines
50+
// table.push(
51+
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
52+
// );
53+
54+
switch( state ) {
55+
case State.NonEmailAddress: stateNonEmailAddress( char ); break;
56+
case State.LocalPart: stateLocalPart( char ); break;
57+
case State.LocalPartDot: stateLocalPartDot( char ); break;
58+
case State.AtSign: stateAtSign( char ); break;
59+
case State.DomainChar: stateDomainChar( char ); break;
60+
case State.DomainHyphen: stateDomainHyphen( char ); break;
61+
case State.DomainDot: stateDomainDot( char ); break;
62+
63+
default:
64+
throwUnhandledCaseError( state );
65+
}
66+
67+
// For debugging: search for other "For debugging" lines
68+
// table.push(
69+
// [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
70+
// );
71+
72+
charIdx++;
5873
}
5974

75+
// Capture any valid match at the end of the string
76+
captureMatchIfValidAndReset();
77+
78+
// For debugging: search for other "For debugging" lines
79+
//console.log( '\n' + table.toString() );
80+
6081
return matches;
82+
83+
84+
// Handles the state when we're not in an email address
85+
function stateNonEmailAddress( char: string ) {
86+
if( localPartCharRegex.test( char ) ) {
87+
beginEmailAddress();
88+
89+
} else {
90+
// not an email address character, continue
91+
}
92+
}
93+
94+
95+
// Handles the state when we're currently in the "local part" of an
96+
// email address (as opposed to the "domain part")
97+
function stateLocalPart( char: string ) {
98+
if( char === '.' ) {
99+
state = State.LocalPartDot;
100+
101+
} else if( char === '@' ) {
102+
state = State.AtSign;
103+
104+
} else if( localPartCharRegex.test( char ) ) {
105+
// stay in the "local part" of the email address
106+
107+
} else {
108+
// not an email address character, return to "NonEmailAddress" state
109+
resetToNonEmailAddressState();
110+
}
111+
}
112+
113+
114+
// Handles the state where we've read a '.' character in the "local part" of the email address
115+
function stateLocalPartDot( char: string ) {
116+
if( char === '.' ) {
117+
// We read a second '.' in a row, not a valid email address
118+
// local part
119+
resetToNonEmailAddressState();
120+
121+
} else if( char === '@' ) {
122+
// We read the '@' character immediately after a dot ('.'), not
123+
// an email address
124+
resetToNonEmailAddressState();
125+
126+
} else if( localPartCharRegex.test( char ) ) {
127+
state = State.LocalPart;
128+
129+
} else {
130+
// Anything else, not an email address
131+
resetToNonEmailAddressState();
132+
}
133+
}
134+
135+
136+
function stateAtSign( char: string ) {
137+
if( domainNameCharRegex.test( char ) ) {
138+
state = State.DomainChar;
139+
140+
} else {
141+
// Anything else, not an email address
142+
resetToNonEmailAddressState();
143+
}
144+
}
145+
146+
function stateDomainChar( char: string ) {
147+
if( char === '.' ) {
148+
state = State.DomainDot;
149+
150+
} else if( char === '-' ) {
151+
state = State.DomainHyphen;
152+
153+
} else if( domainNameCharRegex.test( char ) ) {
154+
// Stay in the DomainChar state
155+
156+
} else {
157+
// Anything else, we potentially matched if the criteria has
158+
// been met
159+
captureMatchIfValidAndReset();
160+
}
161+
}
162+
163+
function stateDomainHyphen( char: string ) {
164+
if( char === '-' || char === '.' ) {
165+
// Not valid to have two hyphens ("--") or hyphen+dot ("-.")
166+
captureMatchIfValidAndReset();
167+
168+
} else if( domainNameCharRegex.test( char ) ) {
169+
state = State.DomainChar;
170+
171+
} else {
172+
// Anything else
173+
captureMatchIfValidAndReset();
174+
}
175+
}
176+
177+
function stateDomainDot( char: string ) {
178+
if( char === '.' || char === '-' ) {
179+
// not valid to have two dots ("..") or dot+hyphen (".-")
180+
captureMatchIfValidAndReset();
181+
182+
} else if( domainNameCharRegex.test( char ) ) {
183+
state = State.DomainChar;
184+
185+
// After having read a '.' and then a valid domain character,
186+
// we now know that the domain part of the email is valid, and
187+
// we have found at least a partial EmailMatch (however, the
188+
// email address may have additional characters from this point)
189+
currentEmailAddress = new CurrentEmailAddress( {
190+
...currentEmailAddress,
191+
hasDomainDot: true
192+
} );
193+
194+
} else {
195+
// Anything else
196+
captureMatchIfValidAndReset();
197+
}
198+
}
199+
200+
201+
function beginEmailAddress() {
202+
state = State.LocalPart;
203+
currentEmailAddress = new CurrentEmailAddress( { idx: charIdx } );
204+
}
205+
206+
function resetToNonEmailAddressState() {
207+
state = State.NonEmailAddress;
208+
currentEmailAddress = noCurrentEmailAddress
209+
}
210+
211+
212+
/*
213+
* Captures the current email address as an EmailMatch if it's valid,
214+
* and resets the state to read another email address.
215+
*/
216+
function captureMatchIfValidAndReset() {
217+
if( currentEmailAddress.hasDomainDot ) { // we need at least one dot in the domain to be considered a valid email address
218+
let emailAddress = text.slice( currentEmailAddress.idx, charIdx );
219+
220+
// If we read a '.' or '-' char that ended the email address
221+
// (valid domain name characters, but only valid email address
222+
// characters if they are followed by something else), strip
223+
// it off now
224+
if( /[-.]$/.test( emailAddress ) ){
225+
emailAddress = emailAddress.slice( 0, -1 );
226+
}
227+
228+
matches.push( new EmailMatch( {
229+
tagBuilder : tagBuilder,
230+
matchedText : emailAddress,
231+
offset : currentEmailAddress.idx,
232+
email : emailAddress
233+
} ) );
234+
}
235+
236+
resetToNonEmailAddressState();
237+
}
61238
}
62239

63240
}
241+
242+
243+
const enum State {
244+
NonEmailAddress = 0,
245+
LocalPart,
246+
LocalPartDot,
247+
AtSign,
248+
DomainChar,
249+
DomainHyphen,
250+
DomainDot
251+
}
252+
253+
254+
class CurrentEmailAddress {
255+
readonly idx: number; // the index of the first character in the email address
256+
readonly hasDomainDot: boolean;
257+
258+
constructor( cfg: Partial<CurrentEmailAddress> = {} ) {
259+
this.idx = cfg.idx !== undefined ? cfg.idx : -1;
260+
this.hasDomainDot = !!cfg.hasDomainDot;
261+
}
262+
}

src/regex-lib.ts

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -168,3 +168,10 @@ export const getDomainNameStr = ( group: number ) => {
168168
* Ex: 'google', 'yahoo', 'some-other-company', etc.
169169
*/
170170
export const domainNameRegex = new RegExp( '[' + alphaNumericAndMarksCharsStr + '.\\-]*[' + alphaNumericAndMarksCharsStr + '\\-]' );
171+
172+
173+
/**
174+
* A regular expression that is simply the character class of the characters
175+
* that may be used in a domain name, minus the '-' or '.'
176+
*/
177+
export const domainNameCharRegex = new RegExp( `[${alphaNumericAndMarksCharsStr}]` );

src/utils.ts

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,3 +127,12 @@ export function splitAndCapture( str: string, splitRegex: RegExp ) {
127127

128128
return result;
129129
}
130+
131+
132+
/**
133+
* Function that should never be called but is used to check that every
134+
* enum value is handled using TypeScript's 'never' type.
135+
*/
136+
export function throwUnhandledCaseError( theValue: never ) {
137+
throw new Error( `Unhandled case for value: '${theValue}'` );
138+
}

tests/autolinker-email.spec.ts

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,13 @@ describe( "Autolinker Email Matching -", () => {
3333
} );
3434

3535

36+
it( "should automatically link email addresses with a period at the end of a sentence (but not include the period)", function() {
37+
let result = autolinker.link( "Joe's email is [email protected]. Try emailing him" );
38+
39+
expect( result ).toBe( 'Joe\'s email is <a href="mailto:[email protected]">[email protected]</a>. Try emailing him' );
40+
} );
41+
42+
3643
it( "should automatically link email addresses with a period in the 'local part'", function() {
3744
let result = autolinker.link( "Joe's email is [email protected]" );
3845

@@ -88,6 +95,7 @@ describe( "Autolinker Email Matching -", () => {
8895
expect( result ).toBe( 'Hi there@stuff' );
8996
} );
9097

98+
9199
it( "should automatically link an email address with tld matched localpart", function () {
92100
let result = autolinker.link( "My email is [email protected]" );
93101

0 commit comments

Comments
 (0)