@@ -1363,44 +1363,44 @@ func TestDecodeStringLatin1(t *testing.T) {
13631363 wantRead : 6 ,
13641364 },
13651365 {
1366- name : "Short Latin1 string with ‘" ,
1367- input : append (
1368- append ([]byte {7 }, []byte {0xe2 , 'f' , 'h' , 0xe9 }... ),
1369- []byte ("‘" )... ),
1370- length : 5 ,
1371- wantStr : "âfhé'" ,
1366+ name : "Short Latin1 string with UTF-8 smart quote '" ,
1367+ input : append ([]byte {7 }, append (
1368+ []byte {0xe2 , 'f' , 'h' , 0xe9 },
1369+ []byte {0xE2 , 0x80 , 0x99 }... )... ), // UTF-8 encoding of U+2019 RIGHT SINGLE QUOTATION MARK (3 bytes)
1370+ length : 255 ,
13721371 charset : "latin1" ,
1373- wantRead : 6 ,
1372+ wantStr : "âfhéâ " , // UTF-8 smart quote decoded as Latin-1 becomes "â "
1373+ wantRead : 8 ,
13741374 },
13751375 {
1376- name : "Short Latin1 string with ’" ,
1377- input : append (
1378- append ([]byte {7 }, []byte {0xe2 , 'f' , 'h' , 0xe9 }... ),
1379- []byte ("’" )... ),
1380- length : 5 ,
1381- wantStr : "âfhé'" ,
1376+ name : "Short Latin1 string with UTF-8 smart quote '" ,
1377+ input : append ([]byte {7 }, append (
1378+ []byte {0xe2 , 'f' , 'h' , 0xe9 },
1379+ []byte {0xE2 , 0x80 , 0x98 }... )... ), // UTF-8 encoding of U+2018 LEFT SINGLE QUOTATION MARK (3 bytes)
1380+ length : 255 ,
13821381 charset : "latin1" ,
1383- wantRead : 6 ,
1382+ wantStr : "âfhéâ " , // UTF-8 smart quote decoded as Latin-1
1383+ wantRead : 8 ,
13841384 },
13851385 {
1386- name : "Short Latin1 string with ”" ,
1387- input : append (
1388- append ([]byte {7 }, []byte {0xe2 , 'f' , 'h' , 0xe9 }... ),
1389- []byte ("”" )... ),
1390- length : 5 ,
1391- wantStr : "âfhé\" " ,
1386+ name : "Short Latin1 string with UTF-8 smart quote " ,
1387+ input : append ([]byte {7 }, append (
1388+ []byte {0xe2 , 'f' , 'h' , 0xe9 },
1389+ []byte {0xE2 , 0x80 , 0x9C }... )... ), // UTF-8 encoding of U+201C LEFT DOUBLE QUOTATION MARK (3 bytes)
1390+ length : 255 ,
13921391 charset : "latin1" ,
1393- wantRead : 6 ,
1392+ wantStr : "âfhéâ " , // UTF-8 smart quote decoded as Latin-1
1393+ wantRead : 8 ,
13941394 },
13951395 {
1396- name : "Short Latin1 string with “" ,
1397- input : append (
1398- append ([]byte {7 }, []byte {0xe2 , 'f' , 'h' , 0xe9 }... ),
1399- []byte ("“" )... ),
1400- length : 5 ,
1401- wantStr : "âfhé\" " ,
1396+ name : "Short Latin1 string with UTF-8 smart quote " ,
1397+ input : append ([]byte {7 }, append (
1398+ []byte {0xe2 , 'f' , 'h' , 0xe9 },
1399+ []byte {0xE2 , 0x80 , 0x9D }... )... ), // UTF-8 encoding of U+201D RIGHT DOUBLE QUOTATION MARK (3 bytes)
1400+ length : 255 ,
14021401 charset : "latin1" ,
1403- wantRead : 6 ,
1402+ wantStr : "âfhéâ " , // UTF-8 smart quote decoded as Latin-1
1403+ wantRead : 8 ,
14041404 },
14051405 {
14061406 name : "Invalid UTF-8 valid Latin1" ,
@@ -1414,7 +1414,8 @@ func TestDecodeStringLatin1(t *testing.T) {
14141414 name : "Latin1 with null byte" ,
14151415 input : append ([]byte {4 }, []byte {'A' , 0x00 , 'B' , 'C' }... ), // A\0BC
14161416 length : 4 ,
1417- wantStr : "A\u0000 BC" ,
1417+ charset : "latin1" ,
1418+ wantStr : "A BC" , // null byte becomes space after sanitization
14181419 wantRead : 5 ,
14191420 },
14201421 {
@@ -1445,54 +1446,67 @@ func TestDecodeStringLatin1(t *testing.T) {
14451446 name : "Long string (>255, 2-byte length)" ,
14461447 input : func () []byte {
14471448 buf := new (bytes.Buffer )
1448- err := binary .Write (buf , binary .LittleEndian , uint16 (6 ))
1449- if err != nil {
1450- return nil
1451- }
1452- buf .Write ([]byte {0xe2 , 'f' , 'g' , 'h' , 0xe9 }) // 'âfghé'
1449+ binary .Write (buf , binary .LittleEndian , uint16 (6 ))
1450+ buf .Write ([]byte {0xe2 , 'f' , 'g' , 'h' , 0xe9 , 0x00 }) // 'âfghé\0'
14531451 return buf .Bytes ()
14541452 }(),
14551453 length : 300 ,
14561454 charset : "latin1" ,
1457- wantStr : "âfghé" ,
1458- wantRead : 7 ,
1455+ wantStr : "âfghé " , // null byte becomes space
1456+ wantRead : 8 ,
14591457 },
14601458 {
1461- name : "Term Date and Retro Term Policy" ,
1462- input : append ([]byte {byte (len ("‘30 day term date’" ))}, []byte ("‘30 day term date’" )... ),
1463- length : len ("‘30 day term date’" ),
1464- wantStr : "'30 day term date'" ,
1459+ name : "UTF-8 smart quotes in text" ,
1460+ input : append ([]byte {22 }, append (
1461+ []byte {0xE2 , 0x80 , 0x98 }, // 3 bytes
1462+ append ([]byte ("30 day term date" ), // 16 bytes
1463+ []byte {0xE2 , 0x80 , 0x99 }... )... )... ), // 3 bytes
1464+ length : 255 ,
14651465 charset : "latin1" ,
1466- wantRead : 19 , // Include the prepended length byte
1466+ wantStr : "â 30 day term dateâ " ,
1467+ wantRead : 23 ,
14671468 },
14681469 {
1469- name : "Term Date and Retro Term Policy" ,
1470- input : append ([]byte {byte (len ("“30 day term date”" ))}, []byte ("“30 day term date”" )... ),
1471- length : len ("“30 day term date”" ),
1472- wantStr : "\" 30 day term date\" " ,
1470+ name : "UTF-8 double quotes in text" ,
1471+ input : append ([]byte {22 }, append (
1472+ []byte {0xE2 , 0x80 , 0x9C },
1473+ append ([]byte ("30 day term date" ), // 16 bytes
1474+ []byte {0xE2 , 0x80 , 0x9D }... )... )... ), // 3 bytes
1475+ length : 255 ,
14731476 charset : "latin1" ,
1474- wantRead : 19 , // Include the prepended length byte
1477+ wantStr : "â 30 day term dateâ " ,
1478+ wantRead : 23 , // 1 (length byte) + 22 (content)
14751479 },
1476-
14771480 {
1478- name : "UTF-8 followed by Latin1" ,
1479- input : func () []byte {
1480- data := []byte { // Hello' âfghé
1481- 'H' , 'e' , 'l' , 'l' , 'o' , ' ' , 0x27 , ' ' , 0xe2 , 'f' , 'g' , 'h' , 0xe9 ,
1482- }
1483- return append ([]byte {13 }, data ... ) // Prepend length byte
1484- }(),
1485- length : 12 ,
1486- wantStr : "Hello ' âfghé" ,
1481+ name : "UTF-8 followed by Latin1" ,
1482+ input : append ([]byte {13 }, []byte {'H' , 'e' , 'l' , 'l' , 'o' , ' ' , '\'' , ' ' , 0xe2 , 'f' , 'g' , 'h' , 0xe9 }... ),
1483+ length : 255 ,
14871484 charset : "latin1" ,
1485+ wantStr : "Hello ' âfghé" ,
14881486 wantRead : 14 ,
14891487 },
14901488 {
1491- name : "UTF-8 with Latin1 byte after UTF-8 valid chars " ,
1492- input : append ([]byte {7 }, []byte {0xe2 , ' ' , 'H' , 'e' , 'l' , 'l' , 'o' }... ), // 'Hello â'
1489+ name : "Latin1 byte followed by ASCII " ,
1490+ input : append ([]byte {7 }, []byte {0xe2 , ' ' , 'H' , 'e' , 'l' , 'l' , 'o' }... ),
14931491 length : 7 ,
1492+ charset : "latin1" ,
14941493 wantStr : "â Hello" ,
1494+ wantRead : 8 ,
1495+ },
1496+ {
1497+ name : "Windows-1252 single-byte smart quote (0x92)" ,
1498+ input : append ([]byte {5 }, []byte {'J' , 'o' , 'h' , 'n' , 0x92 }... ), // John' in Windows-1252
1499+ length : 5 ,
1500+ charset : "latin1" ,
1501+ wantStr : "John " , // 0x92 is control character in Latin-1, becomes space
1502+ wantRead : 6 ,
1503+ },
1504+ {
1505+ name : "Windows-1252 double quotes (0x93, 0x94)" ,
1506+ input : append ([]byte {7 }, []byte {0x93 , 'H' , 'e' , 'l' , 'l' , 'o' , 0x94 }... ),
1507+ length : 7 ,
14951508 charset : "latin1" ,
1509+ wantStr : " Hello " , // 0x93 and 0x94 are control chars in Latin-1
14961510 wantRead : 8 ,
14971511 },
14981512 }
0 commit comments