Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
dc9b9cb
Fix charset detection with utf16 and others
Siedlerchr Jul 3, 2022
b2f9b4b
checkstyöe
Siedlerchr Jul 3, 2022
533f905
Fix typo in method names
koppor Jul 3, 2022
ca72dff
change newlines
Siedlerchr Jul 3, 2022
1082f8a
get bytes
Siedlerchr Jul 3, 2022
455ae72
Set newline character to LF
HoussemNasri Jul 3, 2022
c1c5eac
Merge remote-tracking branch 'upstream/detectCharset' into detectCharset
HoussemNasri Jul 3, 2022
e3085b1
Revert "get bytes"
HoussemNasri Jul 3, 2022
8d7a38a
progress
HoussemNasri Jul 4, 2022
59f3ffc
switch line sep to LF
Siedlerchr Jul 4, 2022
f93cfb8
Please work
HoussemNasri Jul 4, 2022
ded49cd
Merge remote-tracking branch 'upstream/detectCharset' into detectCharset
HoussemNasri Jul 4, 2022
00dee77
Merge remote-tracking branch 'upstream/main' into detectCharset
Siedlerchr Jul 4, 2022
832a4bd
Merge remote-tracking branch 'upstream/main' into detectCharset
Siedlerchr Jul 6, 2022
758de5b
Try jitpack
koppor Jul 7, 2022
6287508
Merge branch 'detectCharset' of github.com:JabRef/jabref into detectC…
Siedlerchr Jul 9, 2022
cf59352
Merge remote-tracking branch 'upstream/main' into detectCharset
Siedlerchr Jul 10, 2022
680de72
Add manual build of icu4j
Siedlerchr Jul 10, 2022
28e1616
Check if we have ascii in the list of charsets
Siedlerchr Jul 10, 2022
787d594
fix checkstyle
Siedlerchr Jul 10, 2022
b3795fd
Update external-libraries.md
koppor Jul 10, 2022
164c629
Enocde with UTF-16BE
koppor Jul 10, 2022
67ffee9
Fix umlaut
koppor Jul 10, 2022
4d4a124
Hack to get test running
koppor Jul 10, 2022
d79dfb7
Also compare meta data
koppor Jul 10, 2022
63e7882
Add enforced ignorance of malformed characters
koppor Jul 10, 2022
b5e915d
checkstyle
Siedlerchr Jul 10, 2022
f8df61b
IntelliJ now also renders the file correctly
koppor Jul 10, 2022
2a4c1dd
Add test
koppor Jul 10, 2022
34c8210
Refine CHANGELOG.md
koppor Jul 10, 2022
040ee18
Remove non-working jpackage reference
koppor Jul 10, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ Note that this project **does not** adhere to [Semantic Versioning](http://semve
- We improved the message explaining the options when modifying an automatic keyword group [#8911](https://github.com/JabRef/jabref/issues/8911)
- We moved the preferences option "Warn about duplicates on import" option from the tab "File" to the tab "Import and Export". [koppor#570](https://github.com/koppor/jabref/issues/570)
- When JabRef encounters `% Encoding: UTF-8` header, it is kept during writing (and not removed). [#8964](https://github.com/JabRef/jabref/pull/8964)
- We replace characters which cannot be decoded using the specified encoding with a (probably different) valid character. This happens if JabRef detects the wrong charset (e.g., UTF-8 instead of Windows 1252). One can use the [Integrity Check](https://docs.jabref.org/finding-sorting-and-cleaning-entries/checkintegrity) to find those characters.

### Fixed

Expand Down
3 changes: 1 addition & 2 deletions build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ dependencyLocking {
}

javafx {
version = "18"
version = "18.0.1"
modules = [ 'javafx.controls', 'javafx.fxml', 'javafx.web', 'javafx.swing' ]
}

Expand All @@ -127,7 +127,6 @@ dependencies {
implementation 'com.h2database:h2-mvstore:2.1.214'

implementation group: 'org.apache.tika', name: 'tika-core', version: '2.4.1'
implementation 'com.ibm.icu:icu4j-charset:71.1'

// required for reading write-protected PDFs - see https://github.com/JabRef/jabref/pull/942#issuecomment-209252635
implementation 'org.bouncycastle:bcprov-jdk18on:1.71'
Expand Down
76 changes: 38 additions & 38 deletions external-libraries.md
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,9 @@ Id: com.ibm.icu:*
Project: International Components for Unicode
URL: https://icu.unicode.org/
License: Unicode License (https://www.unicode.org/copyright.html)
Note: Our own fork https://github.com/JabRef/icu. Upstream PR: https://github.com/unicode-org/icu/pull/2127
Path: lib/icu4j.jar
SourcePath: lib/icu4j-src.jar
```

```yaml
Expand Down Expand Up @@ -519,25 +522,24 @@ License: Apache-2.0
3. (on WSL) `sed 's/[^a-z]*//' < build/dependencies.txt | sed "s/\(.*\) .*/\1/" | grep -v "\->" | sort | uniq > build/dependencies-for-external-libraries.txt`

```text
com.fasterxml.jackson.core:jackson-annotations:2.13.2
com.fasterxml.jackson.core:jackson-core:2.13.2
com.fasterxml.jackson.core:jackson-databind:2.13.2
com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.13.2
com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.13.2
com.fasterxml.jackson:jackson-bom:2.13.2
com.github.tomtung:latex2unicode_2.12:0.2.6
com.fasterxml.jackson.core:jackson-annotations:2.13.3
com.fasterxml.jackson.core:jackson-core:2.13.3
com.fasterxml.jackson.core:jackson-databind:2.13.3
com.fasterxml.jackson.dataformat:jackson-dataformat-yaml:2.13.3
com.fasterxml.jackson.datatype:jackson-datatype-jsr310:2.13.3
com.fasterxml.jackson:jackson-bom:2.13.3
com.github.sialcasa.mvvmFX:mvvmfx-validation:f195849ca9
com.github.tomtung:latex2unicode_2.13:0.3.2
com.google.code.gson:gson:2.9.0
com.google.errorprone:error_prone_annotations:2.11.0
com.google.guava:failureaccess:1.0.1
com.google.guava:guava:31.1-jre
com.google.guava:listenablefuture:9999.0-empty-to-avoid-conflict-with-guava
com.google.j2objc:j2objc-annotations:1.3
com.googlecode.javaewah:JavaEWAH:1.1.13
com.h2database:h2-mvstore:2.1.212
com.ibm.icu:icu4j-charset:70.1
com.ibm.icu:icu4j:70.1
com.h2database:h2-mvstore:2.1.214
com.jfoenix:jfoenix:9.0.10
com.konghq:unirest-java:3.13.8
com.konghq:unirest-java:3.13.10
com.microsoft.azure:applicationinsights-core:2.4.1
com.microsoft.azure:applicationinsights-logging-log4j2:2.4.1
com.oracle.ojdbc:ojdbc10:19.3.0.0
Expand Down Expand Up @@ -565,10 +567,9 @@ com.vladsch.flexmark:flexmark-util-visitor:0.64.0
com.vladsch.flexmark:flexmark-util:0.64.0
com.vladsch.flexmark:flexmark:0.64.0
commons-cli:commons-cli:1.5.0
commons-codec:commons-codec:1.11
commons-codec:commons-codec:1.15
commons-io:commons-io:2.11.0
commons-logging:commons-logging:1.2
de.saxsys:mvvmfx-validation:1.9.0-SNAPSHOT
de.saxsys:mvvmfx:1.8.0
de.undercouch:citeproc-java:3.0.0-alpha.6
eu.lestard:doc-annotations:0.2
Expand All @@ -581,7 +582,7 @@ net.java.dev.jna:jna-platform:5.6.0
net.java.dev.jna:jna:5.6.0
net.jcip:jcip-annotations:1.0
net.jodah:typetools:0.6.1
org.antlr:antlr-runtime:3.5.2
org.antlr:antlr-runtime:3.5.3
org.antlr:antlr4-runtime:4.9.3
org.apache.commons:commons-csv:1.9.0
org.apache.commons:commons-lang3:3.12.0
Expand All @@ -590,23 +591,22 @@ org.apache.httpcomponents:httpclient:4.5.13
org.apache.httpcomponents:httpcore-nio:4.4.13
org.apache.httpcomponents:httpcore:4.4.13
org.apache.httpcomponents:httpmime:4.5.13
org.apache.lucene:lucene-analysis-common:9.1.0
org.apache.lucene:lucene-backward-codecs:9.1.0
org.apache.lucene:lucene-core:9.1.0
org.apache.lucene:lucene-highlighter:9.1.0
org.apache.lucene:lucene-queries:9.1.0
org.apache.lucene:lucene-queryparser:9.1.0
org.apache.lucene:lucene-sandbox:9.1.0
org.apache.lucene:lucene-analysis-common:9.2.0
org.apache.lucene:lucene-core:9.2.0
org.apache.lucene:lucene-highlighter:9.2.0
org.apache.lucene:lucene-queries:9.2.0
org.apache.lucene:lucene-queryparser:9.2.0
org.apache.lucene:lucene-sandbox:9.2.0
org.apache.pdfbox:fontbox:3.0.0-RC1
org.apache.pdfbox:pdfbox:3.0.0-RC1
org.apache.pdfbox:xmpbox:3.0.0-RC1
org.apache.tika:tika-core:2.3.0
org.bouncycastle:bcprov-jdk15on:1.70
org.apache.tika:tika-core:2.4.1
org.bouncycastle:bcprov-jdk18on:1.71
org.checkerframework:checker-qual:3.12.0
org.codehaus.mojo:animal-sniffer-annotations:1.18
org.controlsfx:controlsfx:11.1.1
org.eclipse.jgit:org.eclipse.jgit:6.1.0.202203080745-r
org.fxmisc.flowless:flowless:0.6.9
org.eclipse.jgit:org.eclipse.jgit:6.2.0.202206071550-r
org.fxmisc.flowless:flowless:0.6.10
org.fxmisc.richtext:richtextfx:0.10.9
org.fxmisc.undo:undofx:2.1.1
org.fxmisc.wellbehaved:wellbehavedfx:0.3.3
Expand All @@ -616,23 +616,23 @@ org.glassfish.jaxb:jaxb-runtime:3.0.2
org.glassfish.jaxb:txw2:3.0.2
org.jbibtex:jbibtex:1.0.19
org.jetbrains:annotations:15.0
org.jsoup:jsoup:1.14.3
org.jsoup:jsoup:1.15.1
org.kordamp.ikonli:ikonli-core:12.3.1
org.kordamp.ikonli:ikonli-javafx:12.3.1
org.kordamp.ikonli:ikonli-materialdesign2-pack:12.3.1
org.libreoffice:libreoffice:7.3.2
org.libreoffice:unoloader:7.3.2
org.mariadb.jdbc:mariadb-java-client:2.7.5
org.openjfx:javafx-base:18
org.openjfx:javafx-controls:18
org.openjfx:javafx-fxml:18
org.openjfx:javafx-graphics:18
org.openjfx:javafx-media:18
org.openjfx:javafx-swing:18
org.openjfx:javafx-web:18
org.postgresql:postgresql:42.3.3
org.libreoffice:libreoffice:7.3.4
org.libreoffice:unoloader:7.3.4
org.mariadb.jdbc:mariadb-java-client:2.7.6
org.openjfx:javafx-base:18.0.1
org.openjfx:javafx-controls:18.0.1
org.openjfx:javafx-fxml:18.0.1
org.openjfx:javafx-graphics:18.0.1
org.openjfx:javafx-media:18.0.1
org.openjfx:javafx-swing:18.0.1
org.openjfx:javafx-web:18.0.1
org.postgresql:postgresql:42.4.0
org.reactfx:reactfx:2.0-M5
org.scala-lang:scala-library:2.12.8
org.scala-lang:scala-library:2.13.8
org.slf4j:slf4j-api:2.0.0-alpha7
org.tinylog:slf4j-tinylog:2.4.1
org.tinylog:tinylog-api:2.4.1
Expand Down
Binary file added lib/icu4j-src.jar
Binary file not shown.
Binary file added lib/icu4j.jar
Binary file not shown.
8 changes: 3 additions & 5 deletions src/main/java/org/jabref/logic/importer/Importer.java
Original file line number Diff line number Diff line change
Expand Up @@ -123,12 +123,10 @@ protected static Charset getCharset(BufferedInputStream bufferedInputStream) {
if ((matches == null) || (matches.length == 0)) {
return defaultCharSet;
}
if (Arrays.stream(matches).anyMatch(singleCharset -> singleCharset.getName().equals(defaultCharSet.toString()))) {

if (Arrays.stream(matches).anyMatch(charset -> "ASCII".equals(charset.getName()))) {
return defaultCharSet;
}
if (Arrays.stream(matches).anyMatch(singleCharset -> singleCharset.getName().equals(StandardCharsets.UTF_16.toString()))) {
return StandardCharsets.UTF_16;
}

if (matches[0] != null) {
return Charset.forName(matches[0].getName());
Expand Down Expand Up @@ -166,7 +164,7 @@ public static BufferedReader getReader(Path filePath) throws IOException {
return new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
}

public static BufferedReader getReader(InputStream stream) throws IOException {
public static BufferedReader getReader(InputStream stream) {
BufferedInputStream bufferedInputStream = new BufferedInputStream(stream);
Charset charset = getCharset(bufferedInputStream);
InputStreamReader reader = new InputStreamReader(bufferedInputStream, charset);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,10 @@
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CodingErrorAction;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.Objects;
Expand Down Expand Up @@ -79,7 +82,14 @@ public ParserResult importDatabase(Path filePath) throws IOException {
LOGGER.debug("Encoding used to read the file: {}", encoding);
}

try (BufferedReader reader = Files.newBufferedReader(filePath, encoding)) {
// Malformed input is replaced by the decoder instead of being reported as an error.
// Unfortunately, no warning is issued to the user.
// As this is a very rare case, we accept that.
CharsetDecoder decoder = encoding.newDecoder();
decoder.onMalformedInput(CodingErrorAction.REPLACE);

try (InputStreamReader inputStreamReader = new InputStreamReader(Files.newInputStream(filePath), decoder);
BufferedReader reader = new BufferedReader(inputStreamReader)) {
ParserResult parserResult = this.importDatabase(reader);
parserResult.getMetaData().setEncoding(encoding);
parserResult.getMetaData().setEncodingExplicitlySupplied(encodingExplicitlySupplied);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,12 @@
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.util.StandardFileType;
import org.jabref.model.database.BibDatabaseMode;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.entry.field.StandardField;
import org.jabref.model.entry.field.UnknownField;
import org.jabref.model.entry.types.StandardEntryType;
import org.jabref.model.metadata.MetaData;
import org.jabref.model.util.DummyFileUpdateMonitor;

import org.junit.jupiter.api.BeforeEach;
Expand Down Expand Up @@ -170,13 +172,40 @@ public void testParsingOfUtf8EncodedFileReadsUmlautCharacterCorrectly(String fil
parserResult.getDatabase().getEntries());
}

/**
 * Imports each given UTF-16BE fixture and checks both the parsed entry
 * (a title containing an umlaut) and the resulting metadata
 * (BibTeX mode, encoding set to UTF-16BE).
 *
 * @param filename name of the test resource to import
 */
@ParameterizedTest
@CsvSource({"encoding-utf-16BE-with-header.bib", "encoding-utf-16BE-without-header.bib"})
public void testParsingOfUtf16EncodedFileReadsUmlautCharacterCorrectly(String filename) throws Exception {
    Path bibFile = Path.of(BibtexImporterTest.class.getResource(filename).toURI());
    ParserResult parserResult = importer.importDatabase(bibFile);

    // The single entry must survive the import with its umlaut intact
    BibEntry expectedEntry = new BibEntry(StandardEntryType.Article)
            .withField(StandardField.TITLE, "Ü ist ein Umlaut");
    assertEquals(List.of(expectedEntry), parserResult.getDatabase().getEntries());

    // The metadata must carry the mode and the encoding used for reading
    MetaData expectedMetaData = new MetaData();
    expectedMetaData.setMode(BibDatabaseMode.BIBTEX);
    expectedMetaData.setEncoding(StandardCharsets.UTF_16BE);
    assertEquals(expectedMetaData, parserResult.getMetaData());
}

/**
 * Imports the fixture "encoding-utf-8-with-header.bib" (judging by its name, a file
 * carrying an encoding header) and checks that the metadata marks the encoding
 * as explicitly supplied.
 */
@Test
public void encodingSupplied() throws Exception {
    Path bibFile = Path.of(BibtexImporterTest.class.getResource("encoding-utf-8-with-header.bib").toURI());
    ParserResult parserResult = importer.importDatabase(bibFile);

    assertTrue(parserResult.getMetaData().getEncodingExplicitlySupplied());
}

/**
 * Imports a fixture whose declared encoding does not match its actual bytes and
 * checks that the undecodable character ends up as the replacement character
 * in the parsed author field.
 */
@Test
public void wrongEncodingSupplied() throws Exception {
    Path bibFile = Path.of(
            BibtexImporterTest.class.getResource("encoding-windows-1252-but-utf-8-declared--decoding-fails.bib").toURI());
    ParserResult parserResult = importer.importDatabase(bibFile);

    // The fixture holds "Test{NBSP}I. Last"; the "{NBSP}" is stored as Windows-1252, not as UTF-8
    BibEntry expectedEntry = new BibEntry(StandardEntryType.Article)
            .withField(StandardField.AUTHOR, "Test�I. Last");
    assertEquals(List.of(expectedEntry), parserResult.getDatabase().getEntries());
}

@Test
public void encodingNotSupplied() throws Exception {
ParserResult parserResult = importer.importDatabase(
Expand Down
Binary file not shown.
Binary file not shown.
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
% Encoding: UTF-8

@Article{,
author = {Test�I. Last},
}