Merge pull request #26 from RiveryIO/fix/eitam/safe_data_handling

eitamring · web-flow · commit 4fcdc09c19c1 · 2025-09-02T09:14:03.000+03:00
Fix/eitam/safe data handling
diff --git a/canal/canal.go b/canal/canal.go
@@ -436,27 +436,29 @@ func isSafeIdentifier(s string) bool {
 
 func (c *Canal) GenerateCharsetQuery() (string, error) {
 	query := `
-		SELECT 
-			c.ORDINAL_POSITION,
-			CASE 
-				WHEN c.CHARACTER_SET_NAME IS NOT NULL THEN c.CHARACTER_SET_NAME
-				WHEN c.DATA_TYPE IN ('binary','varbinary','tinyblob','blob','mediumblob','longblob') THEN col.CHARACTER_SET_NAME
-			END AS CHARACTER_SET_NAME,
-			c.COLUMN_NAME
-		FROM 
-			information_schema.COLUMNS c
-		LEFT JOIN information_schema.TABLES t
-			ON t.TABLE_SCHEMA = c.TABLE_SCHEMA AND t.TABLE_NAME = c.TABLE_NAME
-		LEFT JOIN information_schema.COLLATIONS col
-			ON col.COLLATION_NAME = t.TABLE_COLLATION
-		WHERE 
-			c.TABLE_SCHEMA = ?
-			AND c.TABLE_NAME = ?
-			AND (c.CHARACTER_SET_NAME IS NOT NULL OR c.DATA_TYPE IN ('binary','varbinary','tinyblob','blob','mediumblob','longblob'));
-	`
+       SELECT 
+          c.ORDINAL_POSITION,
+          CASE 
+             WHEN c.CHARACTER_SET_NAME IS NOT NULL THEN c.CHARACTER_SET_NAME
+             WHEN c.DATA_TYPE IN ('binary','varbinary','tinyblob','blob','mediumblob','longblob') THEN col.CHARACTER_SET_NAME
+             ELSE col.CHARACTER_SET_NAME
+          END AS CHARACTER_SET_NAME,
+          c.COLUMN_NAME
+       FROM 
+          information_schema.COLUMNS c
+       LEFT JOIN information_schema.TABLES t
+          ON t.TABLE_SCHEMA = c.TABLE_SCHEMA AND t.TABLE_NAME = c.TABLE_NAME
+       LEFT JOIN information_schema.COLLATIONS col
+          ON col.COLLATION_NAME = t.TABLE_COLLATION
+       WHERE 
+          c.TABLE_SCHEMA = ?
+          AND c.TABLE_NAME = ?
+          AND (c.CHARACTER_SET_NAME IS NOT NULL 
+               OR c.DATA_TYPE IN ('binary','varbinary','tinyblob','blob','mediumblob','longblob')
+               OR c.DATA_TYPE IN ('varchar','char','text','tinytext','mediumtext','longtext'));
+    `
 
 	return query, nil
-
 }
 
 func (c *Canal) setColumnsCharsetFromRows(tableRegex string, rows *sql.Rows) error {
@@ -507,20 +509,19 @@ func (c *Canal) GetColumnsCharsets() error {
 		if err != nil {
 			return fmt.Errorf("failed to generate charset query: %w", err)
 		}
-
 		rows, err := db.QueryContext(c.ctx, query, dbName, tableName)
 		if err != nil {
 			return fmt.Errorf("error occurred while executing query: %s on db: %s on table: %s. error: %v",
 				query, dbName, tableName, errors.Trace(err))
 		}
-
 		// Ensure rows are closed after processing
 		func() {
 			defer rows.Close()
 			if err := c.setColumnsCharsetFromRows(tableRegex, rows); err != nil {
 				panic(fmt.Errorf("failed to set charset from rows: %w", err))
 			}
 		}()
+
 	}
 
 	return nil
diff --git a/replication/row_event.go b/replication/row_event.go
@@ -1363,66 +1363,24 @@ func supportsSmartQuotes(enc encoding.Encoding) bool {
 	return false
 }
 
-func replaceUnsupportedCharacters(data []byte, length int) []byte {
-	if len(data) == 0 {
-		return data
-	}
-
-	var content []byte
-	var prefix []byte
-	var contentLength int
-	var prefixLen int
-
-	if length > 255 {
-		// 2-byte length prefix (LittleEndian)
-		prefixLen = 2
-		contentLength = int(binary.LittleEndian.Uint16(data[:2]))
-		if contentLength > len(data)-prefixLen {
-			contentLength = len(data) - prefixLen
-		}
-		content = data[prefixLen : prefixLen+contentLength]
-	} else {
-		// 1-byte length prefix
-		prefixLen = 1
-		contentLength = int(data[0])
-		if contentLength > len(data)-prefixLen {
-			contentLength = len(data) - prefixLen
-		}
-		content = data[prefixLen : prefixLen+contentLength]
-	}
-
-	// Replace unsupported characters
-	content = normalizeSmartQuotes(content)
-
-	// Rebuild prefix with new length
-	if prefixLen == 2 {
-		prefix = make([]byte, 2)
-		binary.LittleEndian.PutUint16(prefix, uint16(len(content)))
-	} else {
-		prefix = []byte{byte(len(content))}
-	}
-
-	return append(prefix, content...)
-}
-
 func decodeStringWithEncoder(data []byte, length int, enc encoding.Encoding) (v string, n int) {
-	// Define the Latin1 decoder
 	decoder := enc.NewDecoder()
-	if !supportsSmartQuotes(enc) {
-		data = replaceUnsupportedCharacters(data, length)
-	}
 
 	if length < 256 {
-		// If the length is smaller than 256, extract the length from the first byte
 		length = int(data[0])
 		n = length + 1
 		decodedBytes, _, _ := transform.Bytes(decoder, data[1:n])
+		if !supportsSmartQuotes(enc) {
+			decodedBytes = normalizeSmartQuotes(decodedBytes)
+		}
 		v = string(decodedBytes)
 	} else {
-		// If the length is larger, extract it using LittleEndian
 		length = int(binary.LittleEndian.Uint16(data[0:]))
 		n = length + 2
 		decodedBytes, _, _ := transform.Bytes(decoder, data[2:n])
+		if !supportsSmartQuotes(enc) {
+			decodedBytes = normalizeSmartQuotes(decodedBytes)
+		}
 		v = string(decodedBytes)
 	}
 
diff --git a/replication/row_event_test.go b/replication/row_event_test.go
@@ -1363,44 +1363,44 @@ func TestDecodeStringLatin1(t *testing.T) {
 			wantRead: 6,
 		},
 		{
-			name: "Short Latin1 string with ‘",
-			input: append(
-				append([]byte{7}, []byte{0xe2, 'f', 'h', 0xe9}...),
-				[]byte("‘")...),
-			length:   5,
-			wantStr:  "âfhé'",
+			name: "Short Latin1 string with UTF-8 smart quote '",
+			input: append([]byte{7}, append(
+				[]byte{0xe2, 'f', 'h', 0xe9},
+				[]byte{0xE2, 0x80, 0x99}...)...), // UTF-8 ’ (U+2019 right single quotation mark, 3 bytes)
+			length:   255,
 			charset:  "latin1",
-			wantRead: 6,
+			wantStr:  "âfhéâ  ", // UTF-8 smart quote decoded as Latin-1 becomes "â  "
+			wantRead: 8,
 		},
 		{
-			name: "Short Latin1 string with ’",
-			input: append(
-				append([]byte{7}, []byte{0xe2, 'f', 'h', 0xe9}...),
-				[]byte("’")...),
-			length:   5,
-			wantStr:  "âfhé'",
+			name: "Short Latin1 string with UTF-8 smart quote '",
+			input: append([]byte{7}, append(
+				[]byte{0xe2, 'f', 'h', 0xe9},
+				[]byte{0xE2, 0x80, 0x98}...)...), // UTF-8 ‘ (U+2018 left single quotation mark, 3 bytes)
+			length:   255,
 			charset:  "latin1",
-			wantRead: 6,
+			wantStr:  "âfhéâ  ", // UTF-8 smart quote decoded as Latin-1
+			wantRead: 8,
 		},
 		{
-			name: "Short Latin1 string with ”",
-			input: append(
-				append([]byte{7}, []byte{0xe2, 'f', 'h', 0xe9}...),
-				[]byte("”")...),
-			length:   5,
-			wantStr:  "âfhé\"",
+			name: "Short Latin1 string with UTF-8 smart quote ",
+			input: append([]byte{7}, append(
+				[]byte{0xe2, 'f', 'h', 0xe9},
+				[]byte{0xE2, 0x80, 0x9C}...)...), // UTF-8 “ (U+201C left double quotation mark, 3 bytes)
+			length:   255,
 			charset:  "latin1",
-			wantRead: 6,
+			wantStr:  "âfhéâ  ", // UTF-8 smart quote decoded as Latin-1
+			wantRead: 8,
 		},
 		{
-			name: "Short Latin1 string with “",
-			input: append(
-				append([]byte{7}, []byte{0xe2, 'f', 'h', 0xe9}...),
-				[]byte("“")...),
-			length:   5,
-			wantStr:  "âfhé\"",
+			name: "Short Latin1 string with UTF-8 smart quote ",
+			input: append([]byte{7}, append(
+				[]byte{0xe2, 'f', 'h', 0xe9},
+				[]byte{0xE2, 0x80, 0x9D}...)...), // UTF-8 ” (U+201D right double quotation mark, 3 bytes)
+			length:   255,
 			charset:  "latin1",
-			wantRead: 6,
+			wantStr:  "âfhéâ  ", // UTF-8 smart quote decoded as Latin-1
+			wantRead: 8,
 		},
 		{
 			name:     "Invalid UTF-8 valid Latin1",
@@ -1414,7 +1414,8 @@ func TestDecodeStringLatin1(t *testing.T) {
 			name:     "Latin1 with null byte",
 			input:    append([]byte{4}, []byte{'A', 0x00, 'B', 'C'}...), // A\0BC
 			length:   4,
-			wantStr:  "A\u0000BC",
+			charset:  "latin1",
+			wantStr:  "A BC", // null byte becomes space after sanitization
 			wantRead: 5,
 		},
 		{
@@ -1445,54 +1446,67 @@ func TestDecodeStringLatin1(t *testing.T) {
 			name: "Long string (>255, 2-byte length)",
 			input: func() []byte {
 				buf := new(bytes.Buffer)
-				err := binary.Write(buf, binary.LittleEndian, uint16(6))
-				if err != nil {
-					return nil
-				}
-				buf.Write([]byte{0xe2, 'f', 'g', 'h', 0xe9}) // 'âfghé'
+				binary.Write(buf, binary.LittleEndian, uint16(6))
+				buf.Write([]byte{0xe2, 'f', 'g', 'h', 0xe9, 0x00}) // 'âfghé\0'
 				return buf.Bytes()
 			}(),
 			length:   300,
 			charset:  "latin1",
-			wantStr:  "âfghé",
-			wantRead: 7,
+			wantStr:  "âfghé ", // null byte becomes space
+			wantRead: 8,
 		},
 		{
-			name:     "Term Date and Retro Term Policy",
-			input:    append([]byte{byte(len("‘30 day term date’"))}, []byte("‘30 day term date’")...),
-			length:   len("‘30 day term date’"),
-			wantStr:  "'30 day term date'",
+			name: "UTF-8 smart quotes in text",
+			input: append([]byte{22}, append(
+				[]byte{0xE2, 0x80, 0x98}, // 3 bytes
+				append([]byte("30 day term date"), // 16 bytes
+					[]byte{0xE2, 0x80, 0x99}...)...)...), // 3 bytes
+			length:   255,
 			charset:  "latin1",
-			wantRead: 19, // Include the prepended length byte
+			wantStr:  "â  30 day term dateâ  ",
+			wantRead: 23,
 		},
 		{
-			name:     "Term Date and Retro Term Policy",
-			input:    append([]byte{byte(len("“30 day term date”"))}, []byte("“30 day term date”")...),
-			length:   len("“30 day term date”"),
-			wantStr:  "\"30 day term date\"",
+			name: "UTF-8 double quotes in text",
+			input: append([]byte{22}, append(
+				[]byte{0xE2, 0x80, 0x9C},
+				append([]byte("30 day term date"), // 16 bytes
+					[]byte{0xE2, 0x80, 0x9D}...)...)...), // 3 bytes
+			length:   255,
 			charset:  "latin1",
-			wantRead: 19, // Include the prepended length byte
+			wantStr:  "â  30 day term dateâ  ",
+			wantRead: 23, // 1 (length byte) + 22 (content)
 		},
-
 		{
-			name: "UTF-8 followed by Latin1",
-			input: func() []byte {
-				data := []byte{ // Hello' âfghé
-					'H', 'e', 'l', 'l', 'o', ' ', 0x27, ' ', 0xe2, 'f', 'g', 'h', 0xe9,
-				}
-				return append([]byte{13}, data...) // Prepend length byte
-			}(),
-			length:   12,
-			wantStr:  "Hello ' âfghé",
+			name:     "UTF-8 followed by Latin1",
+			input:    append([]byte{13}, []byte{'H', 'e', 'l', 'l', 'o', ' ', '\'', ' ', 0xe2, 'f', 'g', 'h', 0xe9}...),
+			length:   255,
 			charset:  "latin1",
+			wantStr:  "Hello ' âfghé",
 			wantRead: 14,
 		},
 		{
-			name:     "UTF-8 with Latin1 byte after UTF-8 valid chars",
-			input:    append([]byte{7}, []byte{0xe2, ' ', 'H', 'e', 'l', 'l', 'o'}...), // 'Hello â'
+			name:     "Latin1 byte followed by ASCII",
+			input:    append([]byte{7}, []byte{0xe2, ' ', 'H', 'e', 'l', 'l', 'o'}...),
 			length:   7,
+			charset:  "latin1",
 			wantStr:  "â Hello",
+			wantRead: 8,
+		},
+		{
+			name:     "Windows-1252 single-byte smart quote (0x92)",
+			input:    append([]byte{5}, []byte{'J', 'o', 'h', 'n', 0x92}...), // "John’" in Windows-1252 (0x92 = ’, U+2019)
+			length:   5,
+			charset:  "latin1",
+			wantStr:  "John ", // 0x92 is control character in Latin-1, becomes space
+			wantRead: 6,
+		},
+		{
+			name:     "Windows-1252 double quotes (0x93, 0x94)",
+			input:    append([]byte{7}, []byte{0x93, 'H', 'e', 'l', 'l', 'o', 0x94}...),
+			length:   7,
 			charset:  "latin1",
+			wantStr:  " Hello ", // 0x93 and 0x94 are control chars in Latin-1
 			wantRead: 8,
 		},
 	}