Skip to content

Commit db52e4e

Browse files
committed
Move filtering of false positives produced by the matcherRegex from Autolinker.js into a separate class, MatchValidator.js
1 parent 0a321af commit db52e4e

File tree

5 files changed

+173
-63
lines changed

5 files changed

+173
-63
lines changed

Gruntfile.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ module.exports = function(grunt) {
5656
'src/Util.js',
5757
'src/HtmlParser.js',
5858
'src/HtmlTag.js',
59+
'src/MatchValidator.js',
5960
'src/AnchorTagBuilder.js',
6061
'src/match/Match.js',
6162
'src/match/Email.js',

src/Autolinker.js

Lines changed: 16 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,8 @@
9191
*/
9292
var Autolinker = function( cfg ) {
9393
Autolinker.Util.assign( this, cfg ); // assign the properties of `cfg` onto the Autolinker instance. Prototype properties will be used for missing configs.
94+
95+
this.matchValidator = new Autolinker.MatchValidator();
9496
};
9597

9698

@@ -271,22 +273,6 @@ Autolinker.prototype = {
271273
].join( "" ), 'gi' );
272274
} )(),
273275

274-
/**
275-
* @private
276-
* @property {RegExp} invalidProtocolRelMatchRegex
277-
*
278-
* The regular expression used to check a potential protocol-relative URL match, coming from the {@link #matcherRegex}.
279-
* A protocol-relative URL is, for example, "//yahoo.com"
280-
*
281-
* This regular expression is used in conjunction with the {@link #matcherRegex}, and checks to see if there is a word character
282-
* before the '//' in order to determine if we should actually autolink a protocol-relative URL. This is needed because there
283-
* is no negative look-behind in JavaScript regular expressions.
284-
*
285-
* For instance, we want to autolink something like "//google.com", but we don't want to autolink something
286-
* like "abc//google.com"
287-
*/
288-
invalidProtocolRelMatchRegex : /^[\w]\/\//,
289-
290276
/**
291277
* @private
292278
* @property {RegExp} charBeforeProtocolRelMatchRegex
@@ -299,6 +285,14 @@ Autolinker.prototype = {
299285
*/
300286
charBeforeProtocolRelMatchRegex : /^(.)?\/\//,
301287

288+
/**
289+
* @private
290+
* @property {Autolinker.MatchValidator} matchValidator
291+
*
292+
* The MatchValidator object, used to filter out any false positives from the {@link #matcherRegex}. See
293+
* {@link Autolinker.MatchValidator} for details.
294+
*/
295+
302296
/**
303297
* @private
304298
* @property {Autolinker.HtmlParser} htmlParser
@@ -440,7 +434,7 @@ Autolinker.prototype = {
440434
var me = this; // for closure
441435

442436
return text.replace( this.matcherRegex, function( matchStr, $1, $2, $3, $4, $5, $6, $7, $8 ) {
443-
var matchDescObj = me.processCandidateMatch.apply( me, arguments ); // match description object
437+
var matchDescObj = me.processCandidateMatch( matchStr, $1, $2, $3, $4, $5, $6, $7, $8 ); // match description object
444438

445439
// Return out with no changes for match types that are disabled (url, email, twitter), or for matches that are
446440
// invalid (false positives from the matcherRegex, which can't use look-behinds since they are unavailable in JS).
@@ -459,7 +453,7 @@ Autolinker.prototype = {
459453
/**
460454
* Processes a candidate match from the {@link #matcherRegex}.
461455
*
462-
* Not all matches found by the regex are actual URL/email/Twitter matches, as determined by {@link #isValidMatch}. In
456+
* Not all matches found by the regex are actual URL/email/Twitter matches, as determined by the {@link #matchValidator}. In
463457
* this case, the method returns `null`. Otherwise, a valid Object with `prefixStr`, `match`, and `suffixStr` is returned.
464458
*
465459
* @private
@@ -502,7 +496,10 @@ Autolinker.prototype = {
502496

503497
// Return out with `null` for match types that are disabled (url, email, twitter), or for matches that are
504498
// invalid (false positives from the matcherRegex, which can't use look-behinds since they are unavailable in JS).
505-
if( !this.isValidMatch( twitterMatch, emailAddressMatch, urlMatch, protocolUrlMatch, protocolRelativeMatch ) ) {
499+
if(
500+
( twitterMatch && !this.twitter ) || ( emailAddressMatch && !this.email ) || ( urlMatch && !this.urls ) ||
501+
!this.matchValidator.isValidMatch( urlMatch, protocolUrlMatch, protocolRelativeMatch )
502+
) {
506503
return null;
507504
}
508505

@@ -555,47 +552,6 @@ Autolinker.prototype = {
555552
},
556553

557554

558-
559-
560-
/**
561-
* Determines if a given match found by {@link #processTextNode} is valid. Will return `false` for:
562-
*
563-
* 1) Disabled link types (i.e. having a Twitter match, but {@link #twitter} matching is disabled)
564-
* 2) URL matches which do not have at least one period ('.') in the domain name (effectively skipping over
565-
* matches like "abc:def")
566-
* 3) A protocol-relative url match (a URL beginning with '//') whose previous character is a word character
567-
* (effectively skipping over strings like "abc//google.com")
568-
*
569-
* Otherwise, returns `true`.
570-
*
571-
* @private
572-
* @param {String} twitterMatch The matched Twitter handle, if there was one. Will be empty string if the match is not a
573-
* Twitter match.
574-
* @param {String} emailAddressMatch The matched Email address, if there was one. Will be empty string if the match is not
575-
* an Email address match.
576-
* @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
577-
* @param {String} protocolUrlMatch The match URL string for a protocol match. Ex: 'http://yahoo.com'. This is used to match
578-
* something like 'http://localhost', where we won't double check that the domain name has at least one '.' in it.
579-
* @param {String} protocolRelativeMatch The protocol-relative string for a URL match (i.e. '//'), possibly with a preceding
580-
* character (ex, a space, such as: ' //', or a letter, such as: 'a//'). The match is invalid if there is a word character
581-
* preceding the '//'.
582-
* @return {Boolean} `true` if the match given is valid and should be processed, or `false` if the match is invalid and/or
583-
* should just not be processed (such as, if it's a Twitter match, but {@link #twitter} matching is disabled).
584-
*/
585-
isValidMatch : function( twitterMatch, emailAddressMatch, urlMatch, protocolUrlMatch, protocolRelativeMatch ) {
586-
if(
587-
( twitterMatch && !this.twitter ) || ( emailAddressMatch && !this.email ) || ( urlMatch && !this.urls ) ||
588-
( urlMatch && ( !protocolUrlMatch || !(/:\/\//).test( protocolUrlMatch ) ) && urlMatch.indexOf( '.' ) === -1 ) || // At least one period ('.') must exist in the URL match for us to consider it an actual URL, *unless* it was a full protocol match (like 'http://localhost')
589-
( urlMatch && /^[A-Za-z]{3,9}:/.test( urlMatch ) && !/:.*?[A-Za-z]/.test( urlMatch ) ) || // At least one letter character must exist in the domain name after a protocol match. Ex: skip over something like "git:1.0"
590-
( protocolRelativeMatch && this.invalidProtocolRelMatchRegex.test( protocolRelativeMatch ) ) // a protocol-relative match which has a word character in front of it (so we can skip something like "abc//google.com")
591-
) {
592-
return false;
593-
}
594-
595-
return true;
596-
},
597-
598-
599555
/**
600556
* Determines if a match found has an unmatched closing parenthesis. If so, this parenthesis will be removed
601557
* from the match itself, and appended after the generated anchor tag in {@link #processTextNode}.

src/MatchValidator.js

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
/*global Autolinker */
2+
/**
3+
* @private
4+
* @class Autolinker.MatchValidator
5+
* @extends Object
6+
*
7+
* Used by Autolinker to filter out false positives from the {@link Autolinker#matcherRegex}.
8+
*
9+
* Due to the limitations of regular expressions (including the missing feature of look-behinds in JS regular expressions),
10+
* we cannot always determine the validity of a given match. This class applies a bit of additional logic to filter out any
11+
* false positives that have been matched by the {@link Autolinker#matcherRegex}.
12+
*/
13+
Autolinker.MatchValidator = Autolinker.Util.extend( Object, {
14+
15+
/**
16+
* @private
17+
* @property {RegExp} invalidProtocolRelMatchRegex
18+
*
19+
* The regular expression used to check a potential protocol-relative URL match, coming from the
20+
* {@link Autolinker#matcherRegex}. A protocol-relative URL is, for example, "//yahoo.com"
21+
*
22+
* This regular expression checks to see if there is a word character before the '//' match in order to determine if
23+
* we should actually autolink a protocol-relative URL. This is needed because there is no negative look-behind in
24+
* JavaScript regular expressions.
25+
*
26+
* For instance, we want to autolink something like "Go to: //google.com", but we don't want to autolink something
27+
* like "abc//google.com"
28+
*/
29+
invalidProtocolRelMatchRegex : /^[\w]\/\//,
30+
31+
/**
32+
* Regex to test for a full protocol, with the two trailing slashes. Ex: 'http://'
33+
*
34+
* @private
35+
* @property {RegExp} hasFullProtocolRegex
36+
*/
37+
hasFullProtocolRegex : /^[A-Za-z]{3,9}:\/\//,
38+
39+
/**
40+
* Regex to test for a protocol prefix, such as 'mailto:'
41+
*
42+
* @private
43+
* @property {RegExp} hasProtocolPrefixRegex
44+
*/
45+
hasProtocolPrefixRegex : /^[A-Za-z]{3,9}:/,
46+
47+
/**
48+
* Regex to determine if at least one letter character (A-Z or a-z) exists after the protocol (i.e. after the ':')
49+
*
50+
* @private
51+
* @property {RegExp} hasWordCharAfterProtocolRegex
52+
*/
53+
hasWordCharAfterProtocolRegex : /:.*?[A-Za-z]/,
54+
55+
56+
/**
57+
* Determines if a given match found by {@link Autolinker#processTextNode} is valid. Will return `false` for:
58+
*
59+
* 1) URL matches which do not have at least one period ('.') in the domain name (effectively skipping over
60+
* matches like "abc:def"). However, URL matches with a protocol will be allowed (ex: 'http://localhost')
61+
* 2) URL matches with a protocol prefix which do not have at least one letter character after the protocol (effectively skipping over
62+
* matches like "git:1.0").
63+
* 3) A protocol-relative url match (a URL beginning with '//') whose previous character is a word character
64+
* (effectively skipping over strings like "abc//google.com")
65+
*
66+
* Otherwise, returns `true`.
67+
*
68+
* @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
69+
* @param {String} protocolUrlMatch The match URL string for a protocol match. Ex: 'http://yahoo.com'. This is used to match
70+
* something like 'http://localhost', where we won't double check that the domain name has at least one '.' in it.
71+
* @param {String} protocolRelativeMatch The protocol-relative string for a URL match (i.e. '//'), possibly with a preceding
72+
* character (ex, a space, such as: ' //', or a letter, such as: 'a//'). The match is invalid if there is a word character
73+
* preceding the '//'.
74+
* @return {Boolean} `true` if the match given is valid and should be processed, or `false` if the match is invalid and/or
75+
* should just not be processed.
76+
*/
77+
isValidMatch : function( urlMatch, protocolUrlMatch, protocolRelativeMatch ) {
78+
if(
79+
this.urlMatchDoesNotHaveProtocolOrDot( urlMatch, protocolUrlMatch ) || // At least one period ('.') must exist in the URL match for us to consider it an actual URL, *unless* it was a full protocol match (like 'http://localhost')
80+
this.urlMatchDoesNotHaveAtLeastOneWordChar( urlMatch ) || // At least one letter character must exist in the domain name after a protocol match. Ex: skip over something like "git:1.0"
81+
this.isInvalidProtocolRelativeMatch( protocolRelativeMatch ) // A protocol-relative match which has a word character in front of it (so we can skip something like "abc//google.com")
82+
) {
83+
return false;
84+
}
85+
86+
return true;
87+
},
88+
89+
90+
/**
91+
* Determines if a URL match does not have either:
92+
*
93+
* a) a full protocol (i.e. 'http://'), or
94+
* b) at least one dot ('.') in the domain name (for a non-full-protocol match).
95+
*
96+
* A URL match that has neither of these is considered an invalid URL (ex: 'git:d' has neither the '://' part, nor at least one dot
97+
* in the domain name. If the match was 'git:abc.com', we would consider this valid.)
98+
*
99+
* @private
100+
* @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
101+
* @param {String} protocolUrlMatch The match URL string for a protocol match. Ex: 'http://yahoo.com'. This is used to match
102+
* something like 'http://localhost', where we won't double check that the domain name has at least one '.' in it.
103+
* @return {Boolean} `true` if the URL match does not have a full protocol, or at least one dot ('.') in a non-full-protocol
104+
* match.
105+
*/
106+
urlMatchDoesNotHaveProtocolOrDot : function( urlMatch, protocolUrlMatch ) {
107+
return ( urlMatch && ( !protocolUrlMatch || !this.hasFullProtocolRegex.test( protocolUrlMatch ) ) && urlMatch.indexOf( '.' ) === -1 );
108+
},
109+
110+
111+
/**
112+
* Determines if a URL match does not have at least one word character after the protocol (i.e. in the domain name).
113+
*
114+
* At least one letter character must exist in the domain name after a protocol match. Ex: skip over something
115+
* like "git:1.0"
116+
*
117+
* @private
118+
* @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
119+
* @return {Boolean} `true` if the URL match does not have at least one word character in it after the protocol, `false`
120+
* otherwise.
121+
*/
122+
urlMatchDoesNotHaveAtLeastOneWordChar : function( urlMatch ) {
123+
return ( urlMatch && this.hasProtocolPrefixRegex.test( urlMatch ) && !this.hasWordCharAfterProtocolRegex.test( urlMatch ) );
124+
},
125+
126+
127+
/**
128+
* Determines if a protocol-relative match is an invalid one. This method returns `true` if there is a `protocolRelativeMatch`,
129+
* and that match contains a word character before the '//' (i.e. it must contain a non-word character, or nothing, before the '//' in
130+
* order to be considered valid).
131+
*
132+
* @private
133+
* @param {String} protocolRelativeMatch The protocol-relative string for a URL match (i.e. '//'), possibly with a preceding
134+
* character (ex, a space, such as: ' //', or a letter, such as: 'a//'). The match is invalid if there is a word character
135+
* preceding the '//'.
136+
* @return {Boolean} `true` if it is an invalid protocol-relative match, `false` otherwise.
137+
*/
138+
isInvalidProtocolRelativeMatch : function( protocolRelativeMatch ) {
139+
return ( protocolRelativeMatch && this.invalidProtocolRelMatchRegex.test( protocolRelativeMatch ) );
140+
}
141+
142+
} );

tests/AutolinkerSpec.js

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -157,11 +157,21 @@ describe( "Autolinker", function() {
157157
'DESCRIPTION:Just call this guy yeah! Testings',
158158
'PRIORITY:3',
159159
'END:VEVENT',
160-
'END:VCALENDAR'
160+
'END:VCALENDAR',
161+
'START:123:SOMETHING'
161162
];
163+
var i, len = strings.length, str;
162164

163-
for( var i = 0, len = strings.length; i < len; i++ ) {
164-
expect( autolinker.link( strings[ i ] ) ).toBe( strings[ i ] ); // none should be autolinked
165+
// Test with just the strings themselves.
166+
for( i = 0; i < len; i++ ) {
167+
str = strings[ i ];
168+
expect( autolinker.link( str ) ).toBe( str ); // none should be autolinked
169+
}
170+
171+
// Test with the strings surrounded by other text
172+
for( i = 0; i < len; i++ ) {
173+
str = strings[ i ];
174+
expect( autolinker.link( 'test ' + str + ' test' ) ).toBe( 'test ' + str + ' test' ); // none should be autolinked
165175
}
166176
} );
167177

tests/index.html

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
<script type="text/javascript" src="../src/Util.js"></script>
1616
<script type="text/javascript" src="../src/HtmlParser.js"></script>
1717
<script type="text/javascript" src="../src/HtmlTag.js"></script>
18+
<script type="text/javascript" src="../src/MatchValidator.js"></script>
1819
<script type="text/javascript" src="../src/AnchorTagBuilder.js"></script>
1920
<script type="text/javascript" src="../src/match/Match.js"></script>
2021
<script type="text/javascript" src="../src/match/Email.js"></script>

0 commit comments

Comments
 (0)