Move filtering of false positives produced by the matcherRegex from Autolinker.js into a separate class, MatchValidator.js

gregjacobs · gregjacobs · commit db52e4e1f37e · 2014-11-15T16:20:52.000-05:00
diff --git a/Gruntfile.js b/Gruntfile.js
@@ -56,6 +56,7 @@ module.exports = function(grunt) {
 					'src/Util.js',
 					'src/HtmlParser.js',
 					'src/HtmlTag.js',
+					'src/MatchValidator.js',
 					'src/AnchorTagBuilder.js',
 					'src/match/Match.js',
 					'src/match/Email.js',
diff --git a/src/Autolinker.js b/src/Autolinker.js
@@ -91,6 +91,8 @@
  */
 var Autolinker = function( cfg ) {
 	Autolinker.Util.assign( this, cfg );  // assign the properties of `cfg` onto the Autolinker instance. Prototype properties will be used for missing configs.
+	
+	this.matchValidator = new Autolinker.MatchValidator();
 };
 
 
@@ -271,22 +273,6 @@ Autolinker.prototype = {
 		].join( "" ), 'gi' );
 	} )(),
 	
-	/**
-	 * @private
-	 * @property {RegExp} invalidProtocolRelMatchRegex
-	 * 
-	 * The regular expression used to check a potential protocol-relative URL match, coming from the {@link #matcherRegex}. 
-	 * A protocol-relative URL is, for example, "//yahoo.com"
-	 * 
-	 * This regular expression is used in conjunction with the {@link #matcherRegex}, and checks to see if there is a word character
-	 * before the '//' in order to determine if we should actually autolink a protocol-relative URL. This is needed because there
-	 * is no negative look-behind in JavaScript regular expressions. 
-	 * 
-	 * For instance, we want to autolink something like "//google.com", but we don't want to autolink something 
-	 * like "abc//google.com"
-	 */
-	invalidProtocolRelMatchRegex : /^[\w]\/\//,
-	
 	/**
 	 * @private
 	 * @property {RegExp} charBeforeProtocolRelMatchRegex
@@ -299,6 +285,14 @@ Autolinker.prototype = {
 	 */
 	charBeforeProtocolRelMatchRegex : /^(.)?\/\//,
 	
+	/**
+	 * @private
+	 * @property {Autolinker.MatchValidator} matchValidator
+	 * 
+	 * The MatchValidator object, used to filter out any false positives from the {@link #matcherRegex}. See
+	 * {@link Autolinker.MatchValidator} for details.
+	 */
+	
 	/**
 	 * @private
 	 * @property {Autolinker.HtmlParser} htmlParser
@@ -440,7 +434,7 @@ Autolinker.prototype = {
 		var me = this;  // for closure
 		
 		return text.replace( this.matcherRegex, function( matchStr, $1, $2, $3, $4, $5, $6, $7, $8 ) {
-			var matchDescObj = me.processCandidateMatch.apply( me, arguments );  // match description object
+			var matchDescObj = me.processCandidateMatch( matchStr, $1, $2, $3, $4, $5, $6, $7, $8 );  // match description object
 			
 			// Return out with no changes for match types that are disabled (url, email, twitter), or for matches that are 
 			// invalid (false positives from the matcherRegex, which can't use look-behinds since they are unavailable in JS).
@@ -459,7 +453,7 @@ Autolinker.prototype = {
 	/**
 	 * Processes a candidate match from the {@link #matcherRegex}. 
 	 * 
-	 * Not all matches found by the regex are actual URL/email/Twitter matches, as determined by {@link #isValidMatch}. In
+	 * Not all matches found by the regex are actual URL/email/Twitter matches, as determined by the {@link #matchValidator}. In
 	 * this case, the method returns `null`. Otherwise, a valid Object with `prefixStr`, `match`, and `suffixStr` is returned.
 	 * 
 	 * @private
@@ -502,7 +496,10 @@ Autolinker.prototype = {
 		
 		// Return out with `null` for match types that are disabled (url, email, twitter), or for matches that are 
 		// invalid (false positives from the matcherRegex, which can't use look-behinds since they are unavailable in JS).
-		if( !this.isValidMatch( twitterMatch, emailAddressMatch, urlMatch, protocolUrlMatch, protocolRelativeMatch ) ) {
+		if(
+			( twitterMatch && !this.twitter ) || ( emailAddressMatch && !this.email ) || ( urlMatch && !this.urls ) ||
+			!this.matchValidator.isValidMatch( urlMatch, protocolUrlMatch, protocolRelativeMatch ) 
+		) {
 			return null;
 		}
 		
@@ -555,47 +552,6 @@ Autolinker.prototype = {
 	},
 	
 	
-	
-	
-	/**
-	 * Determines if a given match found by {@link #processTextNode} is valid. Will return `false` for:
-	 * 
-	 * 1) Disabled link types (i.e. having a Twitter match, but {@link #twitter} matching is disabled)
-	 * 2) URL matches which do not have at least have one period ('.') in the domain name (effectively skipping over 
-	 *    matches like "abc:def")
-	 * 3) A protocol-relative url match (a URL beginning with '//') whose previous character is a word character 
-	 *    (effectively skipping over strings like "abc//google.com")
-	 * 
-	 * Otherwise, returns `true`.
-	 * 
-	 * @private
-	 * @param {String} twitterMatch The matched Twitter handle, if there was one. Will be empty string if the match is not a 
-	 *   Twitter match.
-	 * @param {String} emailAddressMatch The matched Email address, if there was one. Will be empty string if the match is not 
-	 *   an Email address match.
-	 * @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
-	 * @param {String} protocolUrlMatch The match URL string for a protocol match. Ex: 'http://yahoo.com'. This is used to match
-	 *   something like 'http://localhost', where we won't double check that the domain name has at least one '.' in it.
-	 * @param {String} protocolRelativeMatch The protocol-relative string for a URL match (i.e. '//'), possibly with a preceding
-	 *   character (ex, a space, such as: ' //', or a letter, such as: 'a//'). The match is invalid if there is a word character
-	 *   preceding the '//'.
-	 * @return {Boolean} `true` if the match given is valid and should be processed, or `false` if the match is invalid and/or 
-	 *   should just not be processed (such as, if it's a Twitter match, but {@link #twitter} matching is disabled}.
-	 */
-	isValidMatch : function( twitterMatch, emailAddressMatch, urlMatch, protocolUrlMatch, protocolRelativeMatch ) {
-		if(
-		    ( twitterMatch && !this.twitter ) || ( emailAddressMatch && !this.email ) || ( urlMatch && !this.urls ) ||
-		    ( urlMatch && ( !protocolUrlMatch || !(/:\/\//).test( protocolUrlMatch ) ) && urlMatch.indexOf( '.' ) === -1 ) ||  // At least one period ('.') must exist in the URL match for us to consider it an actual URL, *unless* it was a full protocol match (like 'http://localhost')
-		    ( urlMatch && /^[A-Za-z]{3,9}:/.test( urlMatch ) && !/:.*?[A-Za-z]/.test( urlMatch ) ) ||     // At least one letter character must exist in the domain name after a protocol match. Ex: skip over something like "git:1.0"
-		    ( protocolRelativeMatch && this.invalidProtocolRelMatchRegex.test( protocolRelativeMatch ) )  // a protocol-relative match which has a word character in front of it (so we can skip something like "abc//google.com")
-		) {
-			return false;
-		}
-		
-		return true;
-	},
-	
-	
 	/**
 	 * Determines if a match found has an unmatched closing parenthesis. If so, this parenthesis will be removed
 	 * from the match itself, and appended after the generated anchor tag in {@link #processTextNode}.
diff --git a/src/MatchValidator.js b/src/MatchValidator.js
@@ -0,0 +1,142 @@
+/*global Autolinker */
+/**
+ * @private
+ * @class Autolinker.MatchValidator
+ * @extends Object
+ * 
+ * Used by Autolinker to filter out false positives from the {@link Autolinker#matcherRegex}.
+ * 
+ * Due to the limitations of regular expressions (including the missing feature of look-behinds in JS regular expressions),
+ * we cannot always determine the validity of a given match. This class applies a bit of additional logic to filter out any
+ * false positives that have been matched by the {@link Autolinker#matcherRegex}.
+ */
+Autolinker.MatchValidator = Autolinker.Util.extend( Object, {
+	
+	/**
+	 * @private
+	 * @property {RegExp} invalidProtocolRelMatchRegex
+	 * 
+	 * The regular expression used to check a potential protocol-relative URL match, coming from the 
+	 * {@link Autolinker#matcherRegex}. A protocol-relative URL is, for example, "//yahoo.com"
+	 * 
+	 * This regular expression checks to see if there is a word character before the '//' match in order to determine if 
+	 * we should actually autolink a protocol-relative URL. This is needed because there is no negative look-behind in 
+	 * JavaScript regular expressions. 
+	 * 
+	 * For instance, we want to autolink something like "Go to: //google.com", but we don't want to autolink something 
+	 * like "abc//google.com"
+	 */
+	invalidProtocolRelMatchRegex : /^[\w]\/\//,
+	
+	/**
+	 * Regex to test for a full protocol, with the two trailing slashes. Ex: 'http://'
+	 * 
+	 * @private
+	 * @property {RegExp} hasFullProtocolRegex
+	 */
+	hasFullProtocolRegex : /^[A-Za-z]{3,9}:\/\//,
+	
+	/**
+	 * Regex to test for a protocol prefix, such as 'mailto:'
+	 * 
+	 * @private
+	 * @property {RegExp} hasProtocolPrefixRegex
+	 */
+	hasProtocolPrefixRegex : /^[A-Za-z]{3,9}:/,
+	
+	/**
+	 * Regex to determine if at least one letter (A-Z, a-z) exists after the protocol (i.e. after the ':')
+	 * 
+	 * @private
+	 * @property {RegExp} hasWordCharAfterProtocolRegex
+	 */
+	hasWordCharAfterProtocolRegex : /:.*?[A-Za-z]/,
+	
+	
+	/**
+	 * Determines if a given match found by {@link Autolinker#processTextNode} is valid. Will return `false` for:
+	 * 
+	 * 1) URL matches which do not have at least one period ('.') in the domain name (effectively skipping over 
+	 *    matches like "abc:def"). However, URL matches with a full protocol will be allowed (ex: 'http://localhost')
+	 * 2) URL matches with a protocol prefix which do not have at least one letter (A-Z, a-z) after the ':'
+	 *    (effectively skipping over matches like "git:1.0").
+	 * 3) A protocol-relative url match (a URL beginning with '//') whose previous character is a word character 
+	 *    (effectively skipping over strings like "abc//google.com")
+	 * 
+	 * Otherwise, returns `true`.
+	 * 
+	 * @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
+	 * @param {String} protocolUrlMatch The match URL string for a protocol match. Ex: 'http://yahoo.com'. This is used to match
+	 *   something like 'http://localhost', where we won't double check that the domain name has at least one '.' in it.
+	 * @param {String} protocolRelativeMatch The protocol-relative string for a URL match (i.e. '//'), possibly with a preceding
+	 *   character (ex, a space, such as: ' //', or a letter, such as: 'a//'). The match is invalid if there is a word character
+	 *   preceding the '//'.
+	 * @return {Boolean} `true` if the match given is valid and should be processed, or `false` if the match is invalid and/or 
+	 *   should just not be processed.
+	 */
+	isValidMatch : function( urlMatch, protocolUrlMatch, protocolRelativeMatch ) {
+		if(
+			this.urlMatchDoesNotHaveProtocolOrDot( urlMatch, protocolUrlMatch ) ||  // At least one period ('.') must exist in the URL match for us to consider it an actual URL, *unless* it was a full protocol match (like 'http://localhost')
+			this.urlMatchDoesNotHaveAtLeastOneWordChar( urlMatch ) ||               // At least one letter character must exist in the domain name after a protocol match. Ex: skip over something like "git:1.0"
+			this.isInvalidProtocolRelativeMatch( protocolRelativeMatch )            // A protocol-relative match which has a word character in front of it (so we can skip something like "abc//google.com")
+		) {
+			return false;
+		}
+		
+		return true;
+	},
+	
+	
+	/**
+	 * Determines if a URL match does not have either:
+	 * 
+	 * a) a full protocol (i.e. 'http://'), or
+	 * b) at least one dot ('.') in the domain name (for a non-full-protocol match).
+	 * 
+	 * Either situation is considered an invalid URL (ex: 'git:d' does not have either the '://' part, or at least one dot
+	 * in the domain name. If the match was 'git:abc.com', we would consider this valid.)
+	 * 
+	 * @private
+	 * @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
+	 * @param {String} protocolUrlMatch The match URL string for a protocol match. Ex: 'http://yahoo.com'. This is used to match
+	 *   something like 'http://localhost', where we won't double check that the domain name has at least one '.' in it.
+	 * @return {Boolean} `true` if the URL match has neither a full protocol (ex: 'http://') nor at least one dot ('.'), meaning
+	 *   that it should be considered invalid.
+	 */
+	urlMatchDoesNotHaveProtocolOrDot : function( urlMatch, protocolUrlMatch ) {
+		return ( urlMatch && ( !protocolUrlMatch || !this.hasFullProtocolRegex.test( protocolUrlMatch ) ) && urlMatch.indexOf( '.' ) === -1 );
+	},
+	
+	
+	/**
+	 * Determines if a URL match with a protocol prefix does not have at least one letter after the protocol (i.e. in the domain name).
+	 * 
+	 * At least one letter character (A-Z, a-z) must exist after the ':' of a protocol match. Ex: skip over something 
+	 * like "git:1.0"
+	 * 
+	 * @private
+	 * @param {String} urlMatch The matched URL, if there was one. Will be an empty string if the match is not a URL match.
+	 * @return {Boolean} `true` if the URL match does not have at least one word character in it after the protocol, `false`
+	 *   otherwise.
+	 */
+	urlMatchDoesNotHaveAtLeastOneWordChar : function( urlMatch ) {
+		return ( urlMatch && this.hasProtocolPrefixRegex.test( urlMatch ) && !this.hasWordCharAfterProtocolRegex.test( urlMatch ) );
+	},
+	
+	
+	/**
+	 * Determines if a protocol-relative match is an invalid one. This method returns `true` if there is a `protocolRelativeMatch`,
+	 * and that match contains a word character before the '//' (i.e. the '//' must be preceded by a non-word character, such as
+	 * a space, or by nothing at all in order to be considered valid).
+	 * 
+	 * @private
+	 * @param {String} protocolRelativeMatch The protocol-relative string for a URL match (i.e. '//'), possibly with a preceding
+	 *   character (ex, a space, such as: ' //', or a letter, such as: 'a//'). The match is invalid if there is a word character
+	 *   preceding the '//'.
+	 * @return {Boolean} `true` if it is an invalid protocol-relative match, `false` otherwise.
+	 */
+	isInvalidProtocolRelativeMatch : function( protocolRelativeMatch ) {
+		return ( protocolRelativeMatch && this.invalidProtocolRelMatchRegex.test( protocolRelativeMatch ) );
+	}
+
+} );
diff --git a/tests/AutolinkerSpec.js b/tests/AutolinkerSpec.js
@@ -157,11 +157,21 @@ describe( "Autolinker", function() {
 						'DESCRIPTION:Just call this guy yeah! Testings',
 						'PRIORITY:3',
 						'END:VEVENT',
-						'END:VCALENDAR'
+						'END:VCALENDAR',
+						'START:123:SOMETHING'
 					];
+					var i, len = strings.length, str;
 					
-					for( var i = 0, len = strings.length; i < len; i++ ) {
-						expect( autolinker.link( strings[ i ] ) ).toBe( strings[ i ] );  // none should be autolinked
+					// Test with just the strings themselves.
+					for( i = 0; i < len; i++ ) {
+						str = strings[ i ];
+						expect( autolinker.link( str ) ).toBe( str );  // none should be autolinked
+					}
+					
+					// Test with the strings surrounded by other text
+					for( i = 0; i < len; i++ ) {
+						str = strings[ i ];
+						expect( autolinker.link( 'test ' + str + ' test' ) ).toBe( 'test ' + str + ' test' );  // none should be autolinked 
 					}
 				} );
 
diff --git a/tests/index.html b/tests/index.html
@@ -15,6 +15,7 @@
 	<script type="text/javascript" src="../src/Util.js"></script>
 	<script type="text/javascript" src="../src/HtmlParser.js"></script>
 	<script type="text/javascript" src="../src/HtmlTag.js"></script>
+	<script type="text/javascript" src="../src/MatchValidator.js"></script>
 	<script type="text/javascript" src="../src/AnchorTagBuilder.js"></script>
 	<script type="text/javascript" src="../src/match/Match.js"></script>
 	<script type="text/javascript" src="../src/match/Email.js"></script>