diff --git a/.editorconfig b/.editorconfig index 238988221..d43b16be5 100644 --- a/.editorconfig +++ b/.editorconfig @@ -34,5 +34,8 @@ indent_size = 2 [metafacture-io/src/test/resources/org/metafacture/io/compressed.txt] insert_final_newline = false +[metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt] +trim_trailing_whitespace = false + [metafacture-runner/src/main/dist/config/java-options.conf] end_of_line = crlf diff --git a/metamorph/src/main/java/org/metafacture/metamorph/maps/FileMap.java b/metamorph/src/main/java/org/metafacture/metamorph/maps/FileMap.java index 878397fc0..1972e362a 100644 --- a/metamorph/src/main/java/org/metafacture/metamorph/maps/FileMap.java +++ b/metamorph/src/main/java/org/metafacture/metamorph/maps/FileMap.java @@ -1,5 +1,5 @@ /* - * Copyright 2013, 2014 Deutsche Nationalbibliothek + * Copyright 2013, 2014, 2021 Deutsche Nationalbibliothek et al * * Licensed under the Apache License, Version 2.0 the "License"; * you may not use this file except in compliance with the License. @@ -29,6 +29,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.nio.charset.StandardCharsets; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.Map; @@ -37,9 +38,16 @@ import java.util.regex.Pattern; /** - * Provides a {@link Map} based on a file. The file is supposed to be UTF-8 - * encoded. The default separator is {@code \t}. Important: - * Lines that are not split in two parts by the separator are ignored! + * Provides a {@link Map} based on files. Can be one file or a comma separated list of files. + * The files are supposed to be UTF-8 encoded. The default separator is {@code \t}. + * + * By setting {@link #allowEmptyValues} to {@code true} the values in the + * {@link Map} can be empty thus enabling e.g. + * {@link org.metafacture.metamorph.functions.SetReplace} to remove matching + * keys. + * + * Important: All other lines that are not split in two parts + * by the separator are ignored! * * @author Markus Michael Geipel */ @@ -48,6 +56,9 @@ public final class FileMap extends AbstractReadOnlyMap { private final Map map = new HashMap<>(); private Pattern split = Pattern.compile("\t", Pattern.LITERAL); + private boolean allowEmptyValues; + private boolean isUninitialized = true; + private ArrayList filenames = new ArrayList<>(); /** * Creates an instance of {@link FileMap}. @@ -55,27 +66,45 @@ public final class FileMap extends AbstractReadOnlyMap { public FileMap() { } + private void init() { + loadFiles(); + isUninitialized = false; + } + + /** + * Sets whether to allow empty values in the {@link Map} or ignore these + * entries. + * + * Default value: false + * + * @param allowEmptyValues true if empty values in the Map are allowed + */ + public void setAllowEmptyValues(final boolean allowEmptyValues) { + this.allowEmptyValues = allowEmptyValues; + } + /** - * Sets a comma separated list of files which are then passed to - * {@link #setFile}. + * Sets a comma separated list of files which provides the {@link Map}. * * @param files a comma separated list of files */ public void setFiles(final String files) { - final String[] parts = files.split("\\s*,\\s*"); - for (final String part : parts) { - setFile(part); - } + Collections.addAll(filenames, files.split("\\s*,\\s*")); } /** - * Provides a {@link Map} based on a file. The file is supposed to be UTF-8 - * encoded. The default separator is {@code \t}. Important: - * Lines that are not split in two parts by the separator are ignored! - * + * Sets a file which provides the {@link Map}. * @param file the file */ public void setFile(final String file) { + Collections.addAll(filenames, file); + } + + private void loadFiles() { + filenames.forEach(this::loadFile); + } + + private void loadFile(final String file) { try ( InputStream stream = openStream(file); BufferedReader reader = new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8)) @@ -85,7 +114,7 @@ public void setFile(final String file) { if (line.isEmpty()) { continue; } - final String[] parts = split.split(line); + final String[] parts = allowEmptyValues ? split.split(line, -1) : split.split(line); if (parts.length == 2) { map.put(parts[0], parts[1]); } @@ -147,11 +176,17 @@ public void setSeparator(final String delimiter) { @Override public String get(final Object key) { + if (isUninitialized) { + init(); + } return map.get(key); } @Override public Set keySet() { + if (isUninitialized) { + init(); + } return Collections.unmodifiableSet(map.keySet()); } diff --git a/metamorph/src/main/resources/schemata/metamorph.xsd b/metamorph/src/main/resources/schemata/metamorph.xsd index 65ec1a0f2..7d3cc7cd0 100644 --- a/metamorph/src/main/resources/schemata/metamorph.xsd +++ b/metamorph/src/main/resources/schemata/metamorph.xsd @@ -587,7 +587,6 @@ - Lookup table defined by text files @@ -598,16 +597,21 @@ Unique name of the lookup table + + + Allow empty values in Map. + + - Filenames + Filename(s) referencing the lookup table(s). Can be one + filename or a comma separated list of filenames. - + - String used in the files to separate key from value. - + String used in the files to separate keys from values. + The default separator is the tabulator. @@ -795,7 +799,7 @@ - Relace strings based on a replacement table. + Replace strings based on a replacement table. diff --git a/metamorph/src/test/java/org/metafacture/metamorph/maps/FileMapTest.java b/metamorph/src/test/java/org/metafacture/metamorph/maps/FileMapTest.java index 388b5389c..e55b06679 100644 --- a/metamorph/src/test/java/org/metafacture/metamorph/maps/FileMapTest.java +++ b/metamorph/src/test/java/org/metafacture/metamorph/maps/FileMapTest.java @@ -47,12 +47,12 @@ public final class FileMapTest { "" + "" + " " + + "file-map-test.txt' %s/>" + ""; @Test public void shouldLookupValuesInFileBasedMap() { - assertMorph(receiver, String.format(MORPH, "lookup in"), + assertMorph(receiver, String.format(MORPH, "lookup in", ""), i -> { i.startRecord("1"); i.literal("1", "gw"); @@ -70,7 +70,7 @@ public void shouldLookupValuesInFileBasedMap() { @Test public void shouldWhitelistValuesInFileBasedMap() { - assertMorph(receiver, String.format(MORPH, "whitelist map"), + assertMorph(receiver, String.format(MORPH, "whitelist map", ""), i -> { i.startRecord("1"); i.literal("1", "gw"); @@ -89,7 +89,7 @@ public void shouldWhitelistValuesInFileBasedMap() { @Test public void shouldReplaceValuesUsingFileBasedMap() { - assertMorph(receiver, String.format(MORPH, "setreplace map"), + assertMorph(receiver, String.format(MORPH, "setreplace map", ""), i -> { i.startRecord("1"); i.literal("1", "gw-fj: 1:1"); @@ -105,4 +105,53 @@ public void shouldReplaceValuesUsingFileBasedMap() { ); } + @Test + public void shouldReplaceCommaSeparatedValuesUsingFileBasedMapSetting() { + assertMorph(receiver, String.format(MORPH, "setreplace map", "separator=\",\""), + i -> { + i.startRecord("1"); + i.literal("1", "gw"); + i.literal("1", "ry\tRyukyuIslands"); + i.endRecord(); + }, + o -> { + o.get().startRecord("1"); + o.get().literal("1", "gw"); + o.get().literal("1", "Southern"); + o.get().endRecord(); + } + ); + } + + @Test + public void shouldReplaceEmptyValuesUsingFileBasedMapSetting() { + assertMorph(receiver, String.format(MORPH, "setreplace map", "allowEmptyValues=\"true\""), + i -> { + i.startRecord("1"); + i.literal("1", "zz"); + i.endRecord(); + }, + o -> { + o.get().startRecord("1"); + o.get().literal("1", ""); + o.get().endRecord(); + } + ); + } + + @Test + public void shouldNotReplaceEmptyValuesUsingFileBasedMapSetting() { + assertMorph(receiver, String.format(MORPH, "setreplace map", ""), + i -> { + i.startRecord("1"); + i.literal("1", "zz"); + i.endRecord(); + }, + o -> { + o.get().startRecord("1"); + o.get().literal("1", "zz"); + o.get().endRecord(); + } + ); + } } diff --git a/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt b/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt index ce9843b05..800404ffe 100644 --- a/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt +++ b/metamorph/src/test/resources/org/metafacture/metamorph/maps/file-map-test.txt @@ -378,3 +378,4 @@ ykc YukonTerritory ys Yemen(People'sDemocraticRepublic) yu SerbiaandMontenegro za Zambia +zz