diff --git a/metafacture-io/src/main/java/org/metafacture/io/FileOpener.java b/metafacture-io/src/main/java/org/metafacture/io/FileOpener.java index 2da6aa472..b88b9aabd 100644 --- a/metafacture-io/src/main/java/org/metafacture/io/FileOpener.java +++ b/metafacture-io/src/main/java/org/metafacture/io/FileOpener.java @@ -40,7 +40,7 @@ */ @Description("Opens a file.") @In(String.class) -@Out(java.io.Reader.class) +@Out(Reader.class) @FluxCommand("open-file") public final class FileOpener extends DefaultObjectPipe> { @@ -66,8 +66,7 @@ public String getEncoding() { /** * Sets the encoding used to open the resource. * - * @param encoding - * new encoding + * @param encoding new encoding */ public void setEncoding(final String encoding) { this.encoding = encoding; @@ -83,7 +82,7 @@ public FileCompression getCompression() { } /** - * * Sets the compression of the file. + * Sets the compression of the file. * * @param compression the {@link FileCompression} */ @@ -94,7 +93,7 @@ public void setCompression(final FileCompression compression) { /** * Sets the compression of the file. * - * @param compression the name of the compression. + * @param compression the name of the compression */ public void setCompression(final String compression) { setCompression(FileCompression.valueOf(compression.toUpperCase())); @@ -112,35 +111,52 @@ public boolean getDecompressConcatenated() { /** * Flags whether to use decompress concatenated file compression. * - * @param decompressConcatenated true if file compression should be decompresses - * concatenated + * @param decompressConcatenated true if file compression should decompress concatenated */ public void setDecompressConcatenated(final boolean decompressConcatenated) { this.decompressConcatenated = decompressConcatenated; } - @Override - public void process(final String file) { + /** + * Opens a file. + * + * @param file the file + * @return a Reader + * @throws IOException if an I/O error occurs + */ + public Reader open(final String file) throws IOException { + return open(new FileInputStream(file)); + } + + /** + * Opens a file stream. + * + * @param stream the stream + * @return a Reader + * @throws IOException if an I/O error occurs + */ + public Reader open(final InputStream stream) throws IOException { try { - final InputStream fileStream = new FileInputStream(file); + final InputStream decompressor = compression.createDecompressor(stream, decompressConcatenated); try { - final InputStream decompressor = compression.createDecompressor(fileStream, decompressConcatenated); - try { - - final Reader reader = new InputStreamReader(new BOMInputStream( - decompressor), encoding); - getReceiver().process(reader); - } - catch (final IOException | MetafactureException e) { - decompressor.close(); - throw e; - } + return new InputStreamReader(new BOMInputStream(decompressor), encoding); } catch (final IOException | MetafactureException e) { - fileStream.close(); + decompressor.close(); throw e; } } + catch (final IOException | MetafactureException e) { + stream.close(); + throw e; + } + } + + @Override + public void process(final String file) { + try { + getReceiver().process(open(file)); + } catch (final IOException e) { throw new MetafactureException(e); } diff --git a/metamorph/build.gradle b/metamorph/build.gradle index a14f783d3..0f57ceea3 100644 --- a/metamorph/build.gradle +++ b/metamorph/build.gradle @@ -22,6 +22,7 @@ dependencies { api project(':metamorph-api') implementation project(':metafacture-commons') implementation project(':metafacture-flowcontrol') + implementation project(':metafacture-io') implementation project(':metafacture-mangling') implementation project(':metafacture-javaintegration') implementation 'org.slf4j:slf4j-api:1.7.21' diff --git a/metamorph/src/main/java/org/metafacture/metamorph/maps/FileMap.java b/metamorph/src/main/java/org/metafacture/metamorph/maps/FileMap.java index 1972e362a..53aa28c03 100644 --- a/metamorph/src/main/java/org/metafacture/metamorph/maps/FileMap.java +++ b/metamorph/src/main/java/org/metafacture/metamorph/maps/FileMap.java @@ -16,6 +16,7 @@ package org.metafacture.metamorph.maps; +import org.metafacture.io.FileOpener; import org.metafacture.metamorph.api.MorphExecutionException; import org.metafacture.metamorph.api.helpers.AbstractReadOnlyMap; @@ -24,11 +25,10 @@ import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; -import java.io.InputStreamReader; +import java.io.Reader; import java.io.UncheckedIOException; import java.net.MalformedURLException; import java.net.URL; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; @@ -38,8 +38,11 @@ import java.util.regex.Pattern; /** - * Provides a {@link Map} based on files. Can be one file or a comma separated list of files. - * The files are supposed to be UTF-8 encoded. The default separator is {@code \t}. + * Provides a {@link Map} based on files. Can be a single file or a + * comma-separated list of files. + * + * The default {@link #setEncoding encoding} is UTF-8. + * The default {@link #setSeparator separator} is {@code \t}. * * By setting {@link #allowEmptyValues} to {@code true} the values in the * {@link Map} can be empty thus enabling e.g. @@ -53,6 +56,7 @@ */ public final class FileMap extends AbstractReadOnlyMap { + private final FileOpener fileOpener = new FileOpener(); private final Map map = new HashMap<>(); private Pattern split = Pattern.compile("\t", Pattern.LITERAL); @@ -100,6 +104,33 @@ public void setFile(final String file) { Collections.addAll(filenames, file); } + /** + * Sets the encoding used to open the resource. + * + * @param encoding new encoding + */ + public void setEncoding(final String encoding) { + fileOpener.setEncoding(encoding); + } + + /** + * Sets the compression of the file. + * + * @param compression the name of the compression + */ + public void setCompression(final String compression) { + fileOpener.setCompression(compression); + } + + /** + * Flags whether to use decompress concatenated file compression. + * + * @param decompressConcatenated true if file compression should decompress concatenated + */ + public void setDecompressConcatenated(final boolean decompressConcatenated) { + fileOpener.setDecompressConcatenated(decompressConcatenated); + } + private void loadFiles() { filenames.forEach(this::loadFile); } @@ -107,10 +138,11 @@ private void loadFiles() { private void loadFile(final String file) { try ( InputStream stream = openStream(file); - BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)) + Reader reader = fileOpener.open(stream); + BufferedReader br = new BufferedReader(reader) ) { String line; - while ((line = reader.readLine()) != null) { + while ((line = br.readLine()) != null) { if (line.isEmpty()) { continue; } @@ -127,10 +159,9 @@ private void loadFile(final String file) { private InputStream openStream(final String file) { return openAsFile(file) - .orElseGet(() -> openAsResource(file) - .orElseGet(() -> openAsUrl(file) - .orElseThrow(() -> new MorphExecutionException( - "File not found: " + file)))); + .orElseGet(() -> openAsResource(file) + .orElseGet(() -> openAsUrl(file) + .orElseThrow(() -> new MorphExecutionException("File not found: " + file)))); } private Optional openAsFile(final String file) { @@ -166,7 +197,7 @@ private Optional openAsUrl(final String file) { /** * Sets the separator. * - * Default value: {@code \t} + * Default value: {@code \t} * * @param delimiter the separator */ diff --git a/metamorph/src/main/resources/schemata/metamorph.xsd b/metamorph/src/main/resources/schemata/metamorph.xsd index 7d3cc7cd0..0fe398e1e 100644 --- a/metamorph/src/main/resources/schemata/metamorph.xsd +++ b/metamorph/src/main/resources/schemata/metamorph.xsd @@ -602,6 +602,21 @@ Allow empty values in Map. + + + Sets the compression of the file. + + + + + Flags whether to use decompress concatenated file compression. + + + + + Sets the encoding used to open the resource. + + Filename(s) referencing the lookup table(s). Can be one diff --git a/metamorph/src/test/java/org/metafacture/metamorph/maps/FileMapTest.java b/metamorph/src/test/java/org/metafacture/metamorph/maps/FileMapTest.java index e55b06679..ee92795bf 100644 --- a/metamorph/src/test/java/org/metafacture/metamorph/maps/FileMapTest.java +++ b/metamorph/src/test/java/org/metafacture/metamorph/maps/FileMapTest.java @@ -46,13 +46,12 @@ public final class FileMapTest { " " + "" + "" + - " " + + " " + ""; @Test public void shouldLookupValuesInFileBasedMap() { - assertMorph(receiver, String.format(MORPH, "lookup in", ""), + assertMorph(receiver, buildMorph("lookup in", ""), i -> { i.startRecord("1"); i.literal("1", "gw"); @@ -70,7 +69,7 @@ public void shouldLookupValuesInFileBasedMap() { @Test public void shouldWhitelistValuesInFileBasedMap() { - assertMorph(receiver, String.format(MORPH, "whitelist map", ""), + assertMorph(receiver, buildMorph("whitelist map", ""), i -> { i.startRecord("1"); i.literal("1", "gw"); @@ -89,7 +88,7 @@ public void shouldWhitelistValuesInFileBasedMap() { @Test public void shouldReplaceValuesUsingFileBasedMap() { - assertMorph(receiver, String.format(MORPH, "setreplace map", ""), + assertMorph(receiver, buildMorph("setreplace map", ""), i -> { i.startRecord("1"); i.literal("1", "gw-fj: 1:1"); @@ -107,7 +106,7 @@ public void shouldReplaceValuesUsingFileBasedMap() { @Test public void shouldReplaceCommaSeparatedValuesUsingFileBasedMapSetting() { - assertMorph(receiver, String.format(MORPH, "setreplace map", "separator=\",\""), + assertMorph(receiver, buildMorph("setreplace map", "separator=\",\""), i -> { i.startRecord("1"); i.literal("1", "gw"); @@ -125,7 +124,7 @@ public void shouldReplaceCommaSeparatedValuesUsingFileBasedMapSetting() { @Test public void shouldReplaceEmptyValuesUsingFileBasedMapSetting() { - assertMorph(receiver, String.format(MORPH, "setreplace map", "allowEmptyValues=\"true\""), + assertMorph(receiver, buildMorph("setreplace map", "allowEmptyValues=\"true\""), i -> { i.startRecord("1"); i.literal("1", "zz"); @@ -141,7 +140,7 @@ public void shouldReplaceEmptyValuesUsingFileBasedMapSetting() { @Test public void shouldNotReplaceEmptyValuesUsingFileBasedMapSetting() { - assertMorph(receiver, String.format(MORPH, "setreplace map", ""), + assertMorph(receiver, buildMorph("setreplace map", ""), i -> { i.startRecord("1"); i.literal("1", "zz"); @@ -154,4 +153,65 @@ public void shouldNotReplaceEmptyValuesUsingFileBasedMapSetting() { } ); } + + @Test + public void shouldLookupValuesInGzipFileMap() { + assertMorph(receiver, buildMorph("lookup in", "file-map-test.txt.gz", ""), + i -> { + i.startRecord("1"); + i.literal("1", "gw"); + i.literal("1", "fj"); + i.endRecord(); + }, + o -> { + o.get().startRecord("1"); + o.get().literal("1", "Germany"); + o.get().literal("1", "Fiji"); + o.get().endRecord(); + } + ); + } + + @Test + public void shouldNotLookupValuesInBlockedGzipFileMapWithoutDecompressConcatenated() { + assertMorph(receiver, buildMorph("lookup in", "file-map-test.txt.bgzf", ""), + i -> { + i.startRecord("1"); + i.literal("1", "gw"); + i.literal("1", "fj"); + i.endRecord(); + }, + o -> { + o.get().startRecord("1"); + o.get().endRecord(); + } + ); + } + + @Test + public void shouldLookupValuesInBlockedGzipFileMap() { + assertMorph(receiver, buildMorph("lookup in", "file-map-test.txt.bgzf", "decompressConcatenated=\"true\""), + i -> { + i.startRecord("1"); + i.literal("1", "gw"); + i.literal("1", "fj"); + i.endRecord(); + }, + o -> { + o.get().startRecord("1"); + o.get().literal("1", "Germany"); + o.get().literal("1", "Fiji"); + o.get().endRecord(); + } + ); + } + + private String buildMorph(final String data, final String options) { + return buildMorph(data, "file-map-test.txt", options); + } + + private String buildMorph(final String data, final String map, final String options) { + return String.format(MORPH, data, map, options); + } + } diff --git a/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt.bgzf b/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt.bgzf new file mode 100644 index 000000000..b49870fcd Binary files /dev/null and b/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt.bgzf differ diff --git a/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt.gz b/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt.gz new file mode 100644 index 000000000..ea631ec4e Binary files /dev/null and b/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt.gz differ