11import { Matcher } from "./matcher" ;
2- import { alphaNumericAndMarksCharsStr , getDomainNameStr } from "../regex-lib" ;
3- import { tldRegex } from "./tld-regex" ;
2+ import { alphaNumericAndMarksCharsStr , domainNameCharRegex } from "../regex-lib" ;
43import { EmailMatch } from "../match/email-match" ;
54import { Match } from "../match/match" ;
5+ import { throwUnhandledCaseError } from '../utils' ;
6+
7+ // For debugging: search for other "For debugging" lines
8+ // import CliTable from 'cli-table';
69
/**
 * @class Autolinker.matcher.Email
 *
 * Matcher that finds email addresses in an input string.
 *
 * The input is scanned one character at a time by a small state machine
 * (see {@link #parseMatches}) rather than by a single regular expression.
 */
export class EmailMatcher extends Matcher {

  /**
   * Valid characters that can be used in the "local" part of an email address,
   * i.e. the "name" part of "name@site.com"
   */
  protected localPartCharRegex = new RegExp(`[${alphaNumericAndMarksCharsStr}!#$%&'*+/=?^_\`{|}~-]`);


  /**
   * @inheritdoc
   *
   * Walks `text` once, left to right. A candidate address starts at the first
   * local-part character, must contain an '@', and is only emitted as an
   * {@link EmailMatch} once its domain has contained a '.' followed by a
   * domain character. A single trailing '.' or '-' is stripped from the match.
   *
   * NOTE(review): the character that terminates a candidate is not re-examined
   * as the possible start of a new address, so for input like `a@b@c.com` the
   * `b@c.com` portion appears not to be matched (assuming
   * `domainNameCharRegex` rejects '@') - confirm whether that is intended.
   *
   * @param text The text to scan for email addresses.
   * @return The email matches found, in the order they appear in `text`.
   */
  parseMatches(text: string): Match[] {
    const tagBuilder = this.tagBuilder,
      localPartCharRegex = this.localPartCharRegex,
      matches: Match[] = [],
      len = text.length,
      // Shared sentinel meaning "no email address is currently being read"
      noCurrentEmailAddress = new CurrentEmailAddress();

    let charIdx = 0,
      // Widened to `State` so the compiler does not narrow it to the initial member
      state = State.NonEmailAddress as State,
      currentEmailAddress = noCurrentEmailAddress;

    // For debugging: search for other "For debugging" lines
    // const table = new CliTable( {
    //   head: [ 'charIdx', 'char', 'state', 'charIdx', 'currentEmailAddress.idx', 'hasDomainDot' ]
    // } );

    while (charIdx < len) {
      const char = text.charAt(charIdx);

      // For debugging: search for other "For debugging" lines
      // table.push(
      //   [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
      // );

      switch (state) {
        case State.NonEmailAddress: stateNonEmailAddress(char); break;
        case State.LocalPart: stateLocalPart(char); break;
        case State.LocalPartDot: stateLocalPartDot(char); break;
        case State.AtSign: stateAtSign(char); break;
        case State.DomainChar: stateDomainChar(char); break;
        case State.DomainHyphen: stateDomainHyphen(char); break;
        case State.DomainDot: stateDomainDot(char); break;

        default:
          throwUnhandledCaseError(state);
      }

      // For debugging: search for other "For debugging" lines
      // table.push(
      //   [ charIdx, char, State[ state ], charIdx, currentEmailAddress.idx, currentEmailAddress.hasDomainDot ]
      // );

      charIdx++;
    }

    // Capture any valid match at the end of the string
    captureMatchIfValidAndReset();

    // For debugging: search for other "For debugging" lines
    // console.log( '\n' + table.toString() );

    return matches;


    // The state handlers below are function declarations, so they are hoisted
    // and may legally be defined after the `return` above.

    // Handles the state when we're not in an email address
    function stateNonEmailAddress(char: string) {
      if (localPartCharRegex.test(char)) {
        beginEmailAddress();
      }
      // otherwise: not an email address character, continue
    }


    // Handles the state when we're currently in the "local part" of an
    // email address (as opposed to the "domain part")
    function stateLocalPart(char: string) {
      if (char === '.') {
        state = State.LocalPartDot;

      } else if (char === '@') {
        state = State.AtSign;

      } else if (localPartCharRegex.test(char)) {
        // stay in the "local part" of the email address

      } else {
        // not an email address character, return to "NonEmailAddress" state
        resetToNonEmailAddressState();
      }
    }


    // Handles the state where we've just read a '.' character inside the
    // local part of an email address
    function stateLocalPartDot(char: string) {
      if (char === '.') {
        // We read a second '.' in a row, not a valid email address
        // local part
        resetToNonEmailAddressState();

      } else if (char === '@') {
        // We read the '@' character immediately after a dot ('.'), not
        // an email address
        resetToNonEmailAddressState();

      } else if (localPartCharRegex.test(char)) {
        state = State.LocalPart;

      } else {
        // Anything else, not an email address
        resetToNonEmailAddressState();
      }
    }


    // Handles the state where we've just read the '@' character
    function stateAtSign(char: string) {
      if (domainNameCharRegex.test(char)) {
        state = State.DomainChar;

      } else {
        // Anything else, not an email address
        resetToNonEmailAddressState();
      }
    }


    // Handles the state where the last character read was a domain name
    // character
    function stateDomainChar(char: string) {
      if (char === '.') {
        state = State.DomainDot;

      } else if (char === '-') {
        state = State.DomainHyphen;

      } else if (domainNameCharRegex.test(char)) {
        // Stay in the DomainChar state

      } else {
        // Anything else, we potentially matched if the criteria has
        // been met
        captureMatchIfValidAndReset();
      }
    }


    // Handles the state where the last character read was a '-' in the
    // domain part
    function stateDomainHyphen(char: string) {
      if (char === '-' || char === '.') {
        // Not valid to have two hyphens ("--") or hyphen+dot ("-.")
        captureMatchIfValidAndReset();

      } else if (domainNameCharRegex.test(char)) {
        state = State.DomainChar;

      } else {
        // Anything else
        captureMatchIfValidAndReset();
      }
    }


    // Handles the state where the last character read was a '.' in the
    // domain part
    function stateDomainDot(char: string) {
      if (char === '.' || char === '-') {
        // not valid to have two dots ("..") or dot+hyphen (".-")
        captureMatchIfValidAndReset();

      } else if (domainNameCharRegex.test(char)) {
        state = State.DomainChar;

        // After having read a '.' and then a valid domain character,
        // we now know that the domain part of the email is valid, and
        // we have found at least a partial EmailMatch (however, the
        // email address may have additional characters from this point)
        currentEmailAddress = new CurrentEmailAddress({
          ...currentEmailAddress,
          hasDomainDot: true
        });

      } else {
        // Anything else
        captureMatchIfValidAndReset();
      }
    }


    // Starts tracking a new candidate email address at the current character
    function beginEmailAddress() {
      state = State.LocalPart;
      currentEmailAddress = new CurrentEmailAddress({ idx: charIdx });
    }


    // Abandons the current candidate (if any) without recording a match
    function resetToNonEmailAddressState() {
      state = State.NonEmailAddress;
      currentEmailAddress = noCurrentEmailAddress;
    }


    /*
     * Captures the current email address as an EmailMatch if it's valid,
     * and resets the state to read another email address.
     */
    function captureMatchIfValidAndReset() {
      if (currentEmailAddress.hasDomainDot) {  // we need at least one dot in the domain to be considered a valid email address
        let emailAddress = text.slice(currentEmailAddress.idx, charIdx);

        // If we read a '.' or '-' char that ended the email address
        // (valid domain name characters, but only valid email address
        // characters if they are followed by something else), strip
        // it off now
        if (/[-.]$/.test(emailAddress)) {
          emailAddress = emailAddress.slice(0, -1);
        }

        matches.push(new EmailMatch({
          tagBuilder: tagBuilder,
          matchedText: emailAddress,
          offset: currentEmailAddress.idx,
          email: emailAddress
        }));
      }

      resetToNonEmailAddressState();
    }
  }

}
241+
242+
/**
 * States of the character-by-character email address scanner.
 *
 * Declared as a `const enum`, so members are inlined as numbers at their use
 * sites. Reverse lookups such as `State[ state ]` (used by the commented-out
 * debugging code) are not permitted on a const enum; drop `const` temporarily
 * when enabling that debugging output.
 */
const enum State {
  /** Not currently inside a candidate email address. */
  NonEmailAddress = 0,
  /** Reading characters of the local part (before the '@'). */
  LocalPart = 1,
  /** The last character read was a '.' inside the local part. */
  LocalPartDot = 2,
  /** The last character read was the '@'. */
  AtSign = 3,
  /** The last character read was a domain name character. */
  DomainChar = 4,
  /** The last character read was a '-' inside the domain part. */
  DomainHyphen = 5,
  /** The last character read was a '.' inside the domain part. */
  DomainDot = 6
}
252+
253+
/**
 * Immutable record describing the email address candidate currently being
 * read. A new instance is created whenever one of its fields must change.
 */
class CurrentEmailAddress {
  /** The index of the first character in the email address; -1 when not supplied. */
  readonly idx: number;

  /** Whether a '.' followed by a domain character has been read in the domain part. */
  readonly hasDomainDot: boolean;

  /**
   * @param cfg Optional initial values. A missing `idx` defaults to -1 and a
   *   missing (or falsy) `hasDomainDot` defaults to `false`.
   */
  constructor(cfg: Partial<CurrentEmailAddress> = {}) {
    // Explicit `undefined` check so that a legitimate index of 0 is kept
    this.idx = cfg.idx !== undefined ? cfg.idx : -1;
    this.hasDomainDot = !!cfg.hasDomainDot;
  }
}
0 commit comments