Skip to content

Commit 6438564

Browse files
authored
Merge pull request #201 from closeio/idn-tld
Support IDN and punycode TLDs
2 parents 5fe9118 + bb914e4 commit 6438564

File tree

8 files changed

+113
-24
lines changed

8 files changed

+113
-24
lines changed

gulpfile.js

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,21 +2,25 @@
22
const clone = require( 'gulp-clone' ),
33
concat = require( 'gulp-concat' ),
44
connect = require( 'gulp-connect' ),
5+
download = require( 'gulp-download' ),
56
gulp = require( 'gulp' ),
67
header = require( 'gulp-header' ),
78
jasmine = require( 'gulp-jasmine' ),
89
jshint = require( 'gulp-jshint' ),
910
merge = require( 'merge-stream' ),
1011
preprocess = require( 'gulp-preprocess' ),
12+
punycode = require( 'punycode' ),
1113
rename = require( 'gulp-rename' ),
1214
through2 = require( 'through2' ),
15+
transform = require( 'gulp-transform' ),
1316
typescript = require( 'gulp-typescript' ),
1417
uglify = require( 'gulp-uglify' ),
1518
umd = require( 'gulp-umd' ),
1619
JsDuck = require( 'gulp-jsduck' ),
1720
KarmaServer = require( 'karma' ).Server;
1821

1922

23+
2024
// Project configuration
2125
const pkg = require( './package.json' ),
2226
banner = createBanner(),
@@ -36,6 +40,7 @@ gulp.task( 'test', [ 'build' ], testTask );
3640
gulp.task( 'doc', [ 'build', 'typescript' ], docTask );
3741
gulp.task( 'serve', [ 'typescript', 'doc' ], serveTask );
3842
gulp.task( 'typescript', typescriptTask ); // for examples
43+
gulp.task( 'update-tld-list', updateTldRegex );
3944

4045

4146
function buildTask() {
@@ -172,6 +177,7 @@ function createSrcFilesList() {
172177
'src/match/Phone.js',
173178
'src/match/Mention.js',
174179
'src/match/Url.js',
180+
'src/matcher/TldRegex.js',
175181
'src/matcher/Matcher.js',
176182
'src/matcher/Email.js',
177183
'src/matcher/Hashtag.js',
@@ -184,3 +190,49 @@ function createSrcFilesList() {
184190
'src/truncate/TruncateSmart.js'
185191
];
186192
}
193+
194+
/**
 * Lower-cases a TLD and, when it has a punycode (`xn--`) form, also returns
 * the decoded Unicode form so both spellings can be matched.
 *
 * @param {String} d A TLD as it appears in the input list (any letter case).
 * @return {String[]} `[ lowerCased ]`, or `[ lowerCased, unicode ]` when
 *   decoding yields a different string.
 */
function dePunycodeDomain(d){
	var ascii = d.toLowerCase(),
		unicode = punycode.toUnicode(ascii);

	// `punycode.toUnicode` only decodes labels that *start* with `xn--` and
	// returns anything else unchanged. Comparing the result (instead of
	// searching for `xn--` anywhere in the string) avoids emitting the same
	// entry twice when `xn--` appears somewhere other than a label prefix.
	return unicode === ascii ? [ascii] : [ascii, unicode];
}
201+
202+
/**
 * Filter predicate: keeps every line that does not begin with a `#`.
 *
 * @param {String} line A single line of the input list.
 * @return {Boolean} `false` when the first character is `#`, `true` otherwise.
 */
function notCommentLine(line){
	var firstChar = String(line).charAt(0);
	return firstChar !== '#';
}
205+
206+
/**
 * Sort comparator that orders strings longest-first, so that in a regex
 * alternation a longer entry is always tried before any shorter entry that
 * is a prefix of it.
 *
 * Ties between equal-length strings are broken by plain code-unit order
 * rather than `localeCompare()`, so the resulting order (and therefore the
 * generated file) does not depend on the locale of the machine running the
 * build. Equal-length, distinct strings can never be prefixes of each other,
 * so the tie-break order has no effect on what the regex matches.
 *
 * @param {String} a
 * @param {String} b
 * @return {Number} Negative if `a` sorts first, positive if `b` sorts first,
 *   `0` if the strings are identical.
 */
function compareLengthLongestFirst(a, b){
	var byLength = b.length - a.length;
	if (byLength !== 0) {
		return byLength;
	}
	if (a === b) {
		return 0;
	}
	return a < b ? -1 : 1;
}
213+
214+
/**
 * Converts the text of a TLD list (one entry per line, `#` lines are
 * comments) into the JavaScript source that assigns `Autolinker.tldRegex`.
 *
 * Each line is trimmed and both `\n` and `\r\n` line endings are accepted:
 * the entries are pasted verbatim into a regex literal, so a stray carriage
 * return or space would otherwise corrupt the generated source.
 *
 * @param {String} contents The raw text of the TLD list.
 * @return {String} Source text of the form
 *   `Autolinker.tldRegex = /(?:a|b|c)/;`, preceded by a `global` lint
 *   directive, with the alternatives sorted by `compareLengthLongestFirst`.
 */
function domainsToRegex(contents){
	var perLine = contents
		.split(/\r?\n/)
		.map(function(line){ return line.trim(); })
		.filter(notCommentLine)
		.map(dePunycodeDomain);

	// Each line maps to an array of one or two spellings; flatten them into a
	// single list and drop empty entries (blank lines).
	var domains = [].concat.apply([], perLine)
		.filter(function(s){ return !!s; });

	domains.sort(compareLengthLongestFirst);

	return '/*global Autolinker */\nAutolinker.tldRegex = /(?:' + domains.join('|') + ')/;\n';
}
227+
228+
/**
 * Gulp task body for `update-tld-list`: downloads the IANA TLD list, turns
 * it into regex source with `domainsToRegex`, prepends a "generated file"
 * header, and writes the result to `./src/matcher/TldRegex.js`.
 *
 * The list is fetched over HTTPS because its contents are written into a
 * source file as executable code; a plain-HTTP download could be tampered
 * with in transit.
 *
 * @return {Object} The stream returned by the final `.pipe()` call.
 */
function updateTldRegex(){
	return download('https://data.iana.org/TLD/tlds-alpha-by-domain.txt')
		.pipe( transform( domainsToRegex, { encoding: 'utf8' } ) )
		.pipe( header( '// NOTE: THIS IS A GENERATED FILE\n// To update with the latest TLD list, run `gulp update-tld-list`\n\n' ) )
		.pipe( rename( function( path ) {
			// The downloaded file is named after the URL; give it its final name.
			path.basename = 'TldRegex';
			path.extname = '.js';
		} ) )
		.pipe( gulp.dest( './src/matcher/' ) );
}
238+

package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,14 @@
3535
"gulp-clone": "^1.0.0",
3636
"gulp-concat": "^2.6.0",
3737
"gulp-connect": "^3.0.0",
38+
"gulp-download": "^0.0.1",
3839
"gulp-header": "^1.7.1",
3940
"gulp-jasmine": "^2.3.0",
4041
"gulp-jsduck": "^0.3.0",
4142
"gulp-jshint": "^2.0.0",
4243
"gulp-preprocess": "^2.0.0",
4344
"gulp-rename": "^1.2.2",
45+
"gulp-transform": "^1.1.0",
4446
"gulp-typescript": "^2.13.0",
4547
"gulp-uglify": "^1.5.3",
4648
"gulp-umd": "^0.2.0",
@@ -54,6 +56,7 @@
5456
"lodash": "^4.3.0",
5557
"merge-stream": "^1.0.0",
5658
"phantomjs-prebuilt": "^2.1.4",
59+
"punycode": "^2.1.0",
5760
"requirejs": "^2.1.11",
5861
"through2": "^2.0.1"
5962
}

src/RegexLib.js

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -52,11 +52,6 @@ Autolinker.RegexLib = (function() {
5252
// See documentation below
5353
var domainNameRegex = new RegExp( '[' + alphaNumericCharsStr + '.\\-]*[' + alphaNumericCharsStr + '\\-]' );
5454

55-
56-
// See documentation below
57-
var tldRegex = /(?:travelersinsurance|sandvikcoromant|kerryproperties|cancerresearch|weatherchannel|kerrylogistics|spreadbetting|international|wolterskluwer|lifeinsurance|construction|pamperedchef|scholarships|versicherung|bridgestone|creditunion|kerryhotels|investments|productions|blackfriday|enterprises|lamborghini|photography|motorcycles|williamhill|playstation|contractors|barclaycard|accountants|redumbrella|engineering|management|telefonica|protection|consulting|tatamotors|creditcard|vlaanderen|schaeffler|associates|properties|foundation|republican|bnpparibas|boehringer|eurovision|extraspace|industries|immobilien|university|technology|volkswagen|healthcare|restaurant|cuisinella|vistaprint|apartments|accountant|travelers|homedepot|institute|vacations|furniture|fresenius|insurance|christmas|bloomberg|solutions|barcelona|firestone|financial|kuokgroup|fairwinds|community|passagens|goldpoint|equipment|lifestyle|yodobashi|aquarelle|marketing|analytics|education|amsterdam|statefarm|melbourne|allfinanz|directory|microsoft|stockholm|montblanc|accenture|lancaster|landrover|everbank|istanbul|graphics|grainger|ipiranga|softbank|attorney|pharmacy|saarland|catering|airforce|yokohama|mortgage|frontier|mutuelle|stcgroup|memorial|pictures|football|symantec|cipriani|ventures|telecity|cityeats|verisign|flsmidth|boutique|cleaning|firmdale|clinique|clothing|redstone|infiniti|deloitte|feedback|services|broadway|plumbing|commbank|training|barclays|exchange|computer|brussels|software|delivery|barefoot|builders|business|bargains|engineer|holdings|download|security|helsinki|lighting|movistar|discount|hdfcbank|supplies|marriott|property|diamonds|capetown|partners|democrat|jpmorgan|bradesco|budapest|rexroth|zuerich|shriram|academy|science|support|youtube|singles|surgery|alibaba|statoil|dentist|schwarz|android|cruises|cricket|digital|markets|starhub|systems|courses|coupons|netbank|country|domains|corsica|network|neustar|realtor|lincoln|limited|schmidt|yamaxun|cooking|contact|auction|spiegel
|liaison|leclerc|latrobe|lasalle|abogado|compare|lanxess|exposed|express|company|cologne|college|avianca|lacaixa|fashion|recipes|ferrero|komatsu|storage|wanggou|clubmed|sandvik|fishing|fitness|bauhaus|kitchen|flights|florist|flowers|watches|weather|temasek|samsung|bentley|forsale|channel|theater|frogans|theatre|okinawa|website|tickets|jewelry|gallery|tiffany|iselect|shiksha|brother|organic|wedding|genting|toshiba|origins|philips|hyundai|hotmail|hoteles|hosting|rentals|windows|cartier|bugatti|holiday|careers|whoswho|hitachi|panerai|caravan|reviews|guitars|capital|trading|hamburg|hangout|finance|stream|family|abbott|health|review|travel|report|hermes|hiphop|gratis|career|toyota|hockey|dating|repair|google|social|soccer|reisen|global|otsuka|giving|unicom|casino|photos|center|broker|rocher|orange|bostik|garden|insure|ryukyu|bharti|safety|physio|sakura|oracle|online|jaguar|gallup|piaget|tienda|futbol|pictet|joburg|webcam|berlin|office|juegos|kaufen|chanel|chrome|xihuan|church|tennis|circle|kinder|flickr|bayern|claims|clinic|viajes|nowruz|xperia|norton|yachts|studio|coffee|camera|sanofi|nissan|author|expert|events|comsec|lawyer|tattoo|viking|estate|villas|condos|realty|yandex|energy|emerck|virgin|vision|durban|living|school|coupon|london|taobao|natura|taipei|nagoya|luxury|walter|aramco|sydney|madrid|credit|maison|makeup|schule|market|anquan|direct|design|swatch|suzuki|alsace|vuelos|dental|alipay|voyage|shouji|voting|airtel|mutual|degree|supply|agency|museum|mobily|dealer|monash|select|mormon|active|moscow|racing|datsun|quebec|nissay|rodeo|email|gifts|works|photo|chloe|edeka|cheap|earth|vista|tushu|koeln|glass|shoes|globo|tunes|gmail|nokia|space|kyoto|black|ricoh|seven|lamer|sener|epson|cisco|praxi|trust|citic|crown|shell|lease|green|legal|lexus|ninja|tatar|gripe|nikon|group|video|wales|autos|gucci|party|nexus|guide|linde|adult|parts|amica|lixil|boats|azure|loans|locus|cymru|lotte|lotto|stada|click|poker|quest|dabur|lupin|nadex|paris|faith|dance|canon|place|gives|trade|sky
pe|rocks|mango|cloud|boots|smile|final|swiss|homes|honda|media|horse|cards|deals|watch|bosch|house|pizza|miami|osaka|tours|total|xerox|coach|sucks|style|delta|toray|iinet|tools|money|codes|beats|tokyo|salon|archi|movie|baidu|study|actor|yahoo|store|apple|world|forex|today|bible|tmall|tirol|irish|tires|forum|reise|vegas|vodka|sharp|omega|weber|jetzt|audio|promo|build|bingo|chase|gallo|drive|dubai|rehab|press|solar|sale|beer|bbva|bank|band|auto|sapo|sarl|saxo|audi|asia|arte|arpa|army|yoga|ally|zara|scor|scot|sexy|seat|zero|seek|aero|adac|zone|aarp|maif|meet|meme|menu|surf|mini|mobi|mtpc|porn|desi|star|ltda|name|talk|navy|love|loan|live|link|news|limo|like|spot|life|nico|lidl|lgbt|land|taxi|team|tech|kred|kpmg|sony|song|kiwi|kddi|jprs|jobs|sohu|java|itau|tips|info|immo|icbc|hsbc|town|host|page|toys|here|help|pars|haus|guru|guge|tube|goog|golf|gold|sncf|gmbh|gift|ggee|gent|gbiz|game|vana|pics|fund|ford|ping|pink|fish|film|fast|farm|play|fans|fail|plus|skin|pohl|fage|moda|post|erni|dvag|prod|doha|prof|docs|viva|diet|luxe|site|dell|sina|dclk|show|qpon|date|vote|cyou|voto|read|coop|cool|wang|club|city|chat|cern|cash|reit|rent|casa|cars|care|camp|rest|call|cafe|weir|wien|rich|wiki|buzz|wine|book|bond|room|work|rsvp|shia|ruhr|blue|bing|shaw|bike|safe|xbox|best|pwc|mtn|lds|aig|boo|fyi|nra|nrw|ntt|car|gal|obi|zip|aeg|vin|how|one|ong|onl|dad|ooo|bet|esq|org|htc|bar|uol|ibm|ovh|gdn|ice|icu|uno|gea|ifm|bot|top|wtf|lol|day|pet|eus|wtc|ubs|tvs|aco|ing|ltd|ink|tab|abb|afl|cat|int|pid|pin|bid|cba|gle|com|cbn|ads|man|wed|ceb|gmo|sky|ist|gmx|tui|mba|fan|ski|iwc|app|pro|med|ceo|jcb|jcp|goo|dev|men|aaa|meo|pub|jlc|bom|jll|gop|jmp|mil|got|gov|win|jot|mma|joy|trv|red|cfa|cfd|bio|moe|moi|mom|ren|biz|aws|xin|bbc|dnp|buy|kfh|mov|thd|xyz|fit|kia|rio|rip|kim|dog|vet|nyc|bcg|mtr|bcn|bms|bmw|run|bzh|rwe|tel|stc|axa|kpn|fly|krd|cab|bnl|foo|crs|eat|tci|sap|srl|nec|sas|net|cal|sbs|sfr|sca|scb|csc|edu|new|xxx|hiv|fox|wme|ngo|nhk|vip|sex|frl|lat|yun|law|you|tax|soy|sew|om|ac|hu|se|sc|sg|sh|sb|sa|rw|ru
|rs|ro|re|qa|py|si|pw|pt|ps|sj|sk|pr|pn|pm|pl|sl|sm|pk|sn|ph|so|pg|pf|pe|pa|zw|nz|nu|nr|np|no|nl|ni|ng|nf|sr|ne|st|nc|na|mz|my|mx|mw|mv|mu|mt|ms|mr|mq|mp|mo|su|mn|mm|ml|mk|mh|mg|me|sv|md|mc|sx|sy|ma|ly|lv|sz|lu|lt|ls|lr|lk|li|lc|lb|la|tc|kz|td|ky|kw|kr|kp|kn|km|ki|kh|tf|tg|th|kg|ke|jp|jo|jm|je|it|is|ir|tj|tk|tl|tm|iq|tn|to|io|in|im|il|ie|ad|sd|ht|hr|hn|hm|tr|hk|gy|gw|gu|gt|gs|gr|gq|tt|gp|gn|gm|gl|tv|gi|tw|tz|ua|gh|ug|uk|gg|gf|ge|gd|us|uy|uz|va|gb|ga|vc|ve|fr|fo|fm|fk|fj|vg|vi|fi|eu|et|es|er|eg|ee|ec|dz|do|dm|dk|vn|dj|de|cz|cy|cx|cw|vu|cv|cu|cr|co|cn|cm|cl|ck|ci|ch|cg|cf|cd|cc|ca|wf|bz|by|bw|bv|bt|bs|br|bo|bn|bm|bj|bi|ws|bh|bg|bf|be|bd|bb|ba|az|ax|aw|au|at|as|ye|ar|aq|ao|am|al|yt|ai|za|ag|af|ae|zm|id)\b/;
58-
59-
6055
return {
6156

6257
/**
@@ -72,22 +67,26 @@ Autolinker.RegexLib = (function() {
7267
alphaNumericCharsStr : alphaNumericCharsStr,
7368

7469
/**
75-
* A regular expression to match domain names of a URL or email address.
76-
* Ex: 'google', 'yahoo', 'some-other-company', etc.
70+
* The string form of a regular expression that would match all of the
71+
* letters in the unicode character set when placed
72+
* in a RegExp character class (`[]`).
7773
*
78-
* @property {RegExp} domainNameRegex
74+
* These would be the characters matched by unicode regex engines `[\p{L}]`
75+
* escape ("all letters")
76+
*
77+
* @property {String} alphaCharsStr
7978
*/
80-
domainNameRegex : domainNameRegex,
79+
alphaCharsStr : alphaCharsStr,
8180

8281
/**
83-
* A regular expression to match top level domains (TLDs) for a URL or
84-
* email address. Ex: 'com', 'org', 'net', etc.
82+
* A regular expression to match domain names of a URL or email address.
83+
* Ex: 'google', 'yahoo', 'some-other-company', etc.
8584
*
86-
* @property {RegExp} tldRegex
85+
* @property {RegExp} domainNameRegex
8786
*/
88-
tldRegex : tldRegex
87+
domainNameRegex : domainNameRegex,
8988

9089
};
9190

9291

93-
}() );
92+
}() );

src/matcher/Email.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ Autolinker.matcher.Email = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
2121
var alphaNumericChars = Autolinker.RegexLib.alphaNumericCharsStr,
2222
emailRegex = new RegExp( '[' + alphaNumericChars + '\\-_\';:&=+$.,]+@' ), // something@ for email addresses (a.k.a. local-part)
2323
domainNameRegex = Autolinker.RegexLib.domainNameRegex,
24-
tldRegex = Autolinker.RegexLib.tldRegex; // match our known top level domains (TLDs)
24+
tldRegex = Autolinker.tldRegex; // match our known top level domains (TLDs)
2525

2626
return new RegExp( [
2727
emailRegex.source,
@@ -54,4 +54,4 @@ Autolinker.matcher.Email = Autolinker.Util.extend( Autolinker.matcher.Matcher, {
5454
return matches;
5555
}
5656

57-
} );
57+
} );

0 commit comments

Comments
 (0)