1919package name .fraser .neil .plaintext ;
2020
2121import java .io .UnsupportedEncodingException ;
22+ import java .lang .Character ;
2223import java .net .URLDecoder ;
2324import java .net .URLEncoder ;
2425import java .util .*;
@@ -1293,6 +1294,46 @@ public void diff_cleanupMerge(LinkedList<Diff> diffs) {
12931294 }
12941295 }
12951296
1297+ /**
1298+ * Rearrange diff boudnaries that split Unicode surrogate pairs.
1299+ * @param diffs Linked list of diff objects
1300+ */
1301+ public void diff_cleanupSplitSurrogates (List <Diff > diffs ) {
1302+ char lastEnd = 0 ;
1303+ boolean isFirst = true ;
1304+ HashSet <Diff > toRemove = new HashSet <Diff >();
1305+
1306+ for (Diff aDiff : diffs ) {
1307+ if (aDiff .text .isEmpty ()) {
1308+ toRemove .add (aDiff );
1309+ continue ;
1310+ }
1311+
1312+ char thisTop = aDiff .text .charAt (0 );
1313+ char thisEnd = aDiff .text .charAt (aDiff .text .length () - 1 );
1314+
1315+ if (Character .isHighSurrogate (thisEnd )) {
1316+ lastEnd = thisEnd ;
1317+ aDiff .text = aDiff .text .substring (0 , aDiff .text .length () - 1 );
1318+ }
1319+
1320+ if (!isFirst && Character .isHighSurrogate (lastEnd ) && Character .isLowSurrogate (thisTop )) {
1321+ aDiff .text = lastEnd + aDiff .text ;
1322+ }
1323+
1324+ isFirst = false ;
1325+
1326+ if ( aDiff .text .isEmpty () ) {
1327+ toRemove .add (aDiff );
1328+ continue ;
1329+ }
1330+ }
1331+
1332+ for (Diff aDiff : toRemove ) {
1333+ diffs .remove (aDiff );
1334+ }
1335+ }
1336+
12961337 /**
12971338 * loc is a location in text1, compute and return the equivalent location in
12981339 * text2.
@@ -1429,6 +1470,7 @@ public int diff_levenshtein(List<Diff> diffs) {
14291470 */
14301471 public String diff_toDelta (List <Diff > diffs ) {
14311472 StringBuilder text = new StringBuilder ();
1473+ this .diff_cleanupSplitSurrogates (diffs );
14321474 for (Diff aDiff : diffs ) {
14331475 switch (aDiff .operation ) {
14341476 case INSERT :
@@ -1457,6 +1499,103 @@ public String diff_toDelta(List<Diff> diffs) {
14571499 return delta ;
14581500 }
14591501
1502+ private int digit16 (char b ) throws IllegalArgumentException {
1503+ switch (b ) {
1504+ case '0' : return 0 ;
1505+ case '1' : return 1 ;
1506+ case '2' : return 2 ;
1507+ case '3' : return 3 ;
1508+ case '4' : return 4 ;
1509+ case '5' : return 5 ;
1510+ case '6' : return 6 ;
1511+ case '7' : return 7 ;
1512+ case '8' : return 8 ;
1513+ case '9' : return 9 ;
1514+ case 'A' : case 'a' : return 10 ;
1515+ case 'B' : case 'b' : return 11 ;
1516+ case 'C' : case 'c' : return 12 ;
1517+ case 'D' : case 'd' : return 13 ;
1518+ case 'E' : case 'e' : return 14 ;
1519+ case 'F' : case 'f' : return 15 ;
1520+ default :
1521+ throw new IllegalArgumentException ();
1522+ }
1523+ }
1524+
1525+ private String decodeURI (String text ) throws IllegalArgumentException {
1526+ int i = 0 ;
1527+ StringBuilder decoded = new StringBuilder (text .length ());
1528+
1529+ while (i < text .length ()) {
1530+ if (text .charAt (i ) != '%' ) {
1531+ decoded .append (text .charAt (i ++));
1532+ continue ;
1533+ }
1534+
1535+ // start a percent-sequence
1536+ int byte1 = (digit16 (text .charAt (i + 1 )) << 4 ) + digit16 (text .charAt (i + 2 ));
1537+ if ((byte1 & 0x80 ) == 0 ) {
1538+ decoded .append (Character .toChars (byte1 ));
1539+ i += 3 ;
1540+ continue ;
1541+ }
1542+
1543+ if ( text .charAt (i + 3 ) != '%' ) {
1544+ throw new IllegalArgumentException ();
1545+ }
1546+
1547+ int byte2 = (digit16 (text .charAt (i + 4 )) << 4 ) + digit16 (text .charAt (i + 5 ));
1548+ if ((byte2 & 0xC0 ) != 0x80 ) {
1549+ throw new IllegalArgumentException ();
1550+ }
1551+ byte2 = byte2 & 0x3F ;
1552+ if ((byte1 & 0xE0 ) == 0xC0 ) {
1553+ decoded .append (Character .toChars (((byte1 & 0x1F ) << 6 ) | byte2 ));
1554+ i += 6 ;
1555+ continue ;
1556+ }
1557+
1558+ if (text .charAt (i + 6 ) != '%' ) {
1559+ throw new IllegalArgumentException ();
1560+ }
1561+
1562+ int byte3 = (digit16 (text .charAt (i + 7 )) << 4 ) + digit16 (text .charAt (i + 8 ));
1563+ if ((byte3 & 0xC0 ) != 0x80 ) {
1564+ throw new IllegalArgumentException ();
1565+ }
1566+ byte3 = byte3 & 0x3F ;
1567+ if ((byte1 & 0xF0 ) == 0xE0 ) {
1568+ // unpaired surrogate are fine here
1569+ decoded .append (Character .toChars (((byte1 & 0x0F ) << 12 ) | (byte2 << 6 ) | byte3 ));
1570+ i += 9 ;
1571+ continue ;
1572+ }
1573+
1574+ if (text .charAt (i + 9 ) != '%' ) {
1575+ throw new IllegalArgumentException ();
1576+ }
1577+
1578+ int byte4 = (digit16 (text .charAt (i + 10 )) << 4 ) + digit16 (text .charAt (i + 11 ));
1579+ if ((byte4 & 0xC0 ) != 0x80 ) {
1580+ throw new IllegalArgumentException ();
1581+ }
1582+ byte4 = byte4 & 0x3F ;
1583+ if ((byte1 & 0xF8 ) == 0xF0 ) {
1584+ int codePoint = ((byte1 & 0x07 ) << 0x12 ) | (byte2 << 0x0C ) | (byte3 << 0x06 ) | byte4 ;
1585+ if (codePoint >= 0x010000 && codePoint <= 0x10FFFF ) {
1586+ decoded .append (Character .toChars ((codePoint & 0xFFFF ) >>> 10 & 0x3FF | 0xD800 ));
1587+ decoded .append (Character .toChars (0xDC00 | (codePoint & 0xFFFF ) & 0x3FF ));
1588+ i += 12 ;
1589+ continue ;
1590+ }
1591+ }
1592+
1593+ throw new IllegalArgumentException ();
1594+ }
1595+
1596+ return decoded .toString ();
1597+ }
1598+
14601599 /**
14611600 * Given the original text1, and an encoded string which describes the
14621601 * operations required to transform text1 into text2, compute the full diff.
@@ -1483,10 +1622,7 @@ public LinkedList<Diff> diff_fromDelta(String text1, String delta)
14831622 // decode would change all "+" to " "
14841623 param = param .replace ("+" , "%2B" );
14851624 try {
1486- param = URLDecoder .decode (param , "UTF-8" );
1487- } catch (UnsupportedEncodingException e ) {
1488- // Not likely on modern system.
1489- throw new Error ("This system does not support UTF-8." , e );
1625+ param = this .decodeURI (param );
14901626 } catch (IllegalArgumentException e ) {
14911627 // Malformed URI sequence.
14921628 throw new IllegalArgumentException (
@@ -2269,10 +2405,7 @@ public List<Patch> patch_fromText(String textline)
22692405 line = text .getFirst ().substring (1 );
22702406 line = line .replace ("+" , "%2B" ); // decode would change all "+" to " "
22712407 try {
2272- line = URLDecoder .decode (line , "UTF-8" );
2273- } catch (UnsupportedEncodingException e ) {
2274- // Not likely on modern system.
2275- throw new Error ("This system does not support UTF-8." , e );
2408+ line = this .decodeURI (line );
22762409 } catch (IllegalArgumentException e ) {
22772410 // Malformed URI sequence.
22782411 throw new IllegalArgumentException (
0 commit comments