diff --git a/metafacture-csv/src/main/java/org/metafacture/csv/CsvEncoder.java b/metafacture-csv/src/main/java/org/metafacture/csv/CsvEncoder.java
new file mode 100644
index 000000000..3723493db
--- /dev/null
+++ b/metafacture-csv/src/main/java/org/metafacture/csv/CsvEncoder.java
@@ -0,0 +1,239 @@
+/*
+ * Copyright 2018-2023 Deutsche Nationalbibliothek et al
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.metafacture.csv;
+
+import org.metafacture.framework.FluxCommand;
+import org.metafacture.framework.MetafactureException;
+import org.metafacture.framework.ObjectReceiver;
+import org.metafacture.framework.StreamReceiver;
+import org.metafacture.framework.annotations.Description;
+import org.metafacture.framework.annotations.In;
+import org.metafacture.framework.annotations.Out;
+import org.metafacture.framework.helpers.DefaultStreamPipe;
+
+import com.opencsv.CSVWriter;
+
+import java.io.IOException;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * A csv encoder that converts a record into a csv line (Default separator: {@value #DEFAULT_SEP}).
+ *
+ * <p>
+ * Each record represents a row. Each literal value represents a column value.
+ * </p>
+ *
+ * <p>
+ * The underlying writer is created when the first record starts (and again after a
+ * stream reset), so the separator and the quoting option must be set before then.
+ * </p>
+ *
+ * @author eberhardtj (j.eberhardt@dnb.de)
+ */
+@Description("Encodes each value in a record as a csv row.")
+@In(StreamReceiver.class)
+@Out(String.class)
+@FluxCommand("encode-csv")
+public class CsvEncoder extends DefaultStreamPipe<ObjectReceiver<String>> {
+    public static final char DEFAULT_SEP = CSVWriter.DEFAULT_SEPARATOR;
+    private static final String SEPARATOR_ERROR = "Separator needs to be a single character.";
+    private CSVWriter csvWriter;
+    private StringWriter writer;
+    private List<String> rowItems = new ArrayList<>();
+    private boolean isFirstRecord = true;
+    private List<String> header = new ArrayList<>();
+    private char separator = DEFAULT_SEP;
+    private boolean noQuotes;
+    private boolean includeHeader;
+    private boolean includeRecordId;
+
+    /**
+     * Creates an instance of {@link CsvEncoder} with a given separator.
+     * Only the first character of the given string is used.
+     *
+     * @param separator to separate columns; must not be null or empty
+     * @throws MetafactureException if the separator is null or empty
+     */
+    public CsvEncoder(final String separator) {
+        if (separator == null || separator.isEmpty()) {
+            throw new MetafactureException(SEPARATOR_ERROR);
+        }
+        this.separator = separator.charAt(0);
+    }
+
+    /**
+     * Creates an instance of {@link CsvEncoder} with a given separator.
+     *
+     * @param separator to separate columns
+     */
+    public CsvEncoder(final char separator) {
+        this.separator = separator;
+    }
+
+    /**
+     * Creates an instance of {@link CsvEncoder}. The default separator is
+     * {@value #DEFAULT_SEP}.
+     */
+    public CsvEncoder() {
+    }
+
+    /**
+     * Start each line with the record ID.
+     * Default is to not start each line with the record ID.
+     *
+     * @param includeRecordId true if the first column should consist of the record's ID
+     */
+    public void setIncludeRecordId(final boolean includeRecordId) {
+        this.includeRecordId = includeRecordId;
+    }
+
+    /**
+     * Add first record as a column description header.
+     * Default is to not add a column description.
+     *
+     * @param includeHeader true if the first record should act as a CSV header, otherwise false
+     */
+    public void setIncludeHeader(final boolean includeHeader) {
+        this.includeHeader = includeHeader;
+    }
+
+    /**
+     * Set the character to separate the columns.
+     * The default is {@value #DEFAULT_SEP}.
+     *
+     * @param separator set the character which separates the columns
+     * @throws MetafactureException if the separator is not exactly one character long
+     */
+    public void setSeparator(final String separator) {
+        // Exactly one character is required; an empty string would make charAt(0) fail.
+        if (separator == null || separator.length() != 1) {
+            throw new MetafactureException(SEPARATOR_ERROR);
+        }
+        this.separator = separator.charAt(0);
+    }
+
+    /**
+     * Set the character to separate the columns.
+     * The default is {@value #DEFAULT_SEP}.
+     *
+     * @param separator set the character which separates the columns
+     */
+    public void setSeparator(final char separator) {
+        this.separator = separator;
+    }
+
+    /**
+     * Set whether values should be written without the surrounding '"' quotes.
+     * The default is to quote values.
+     *
+     * @param noQuotes true if no quotes should be used. Default is false.
+     */
+    public void setNoQuotes(final boolean noQuotes) {
+        this.noQuotes = noQuotes;
+    }
+
+    /**
+     * Creates the buffer and the CSV writer from the current separator and quoting settings.
+     */
+    private void initialize() {
+        writer = new StringWriter();
+        // Rows are handed to the receiver one by one, so no line end is appended.
+        final String emptyLineEnd = "";
+        csvWriter = new CSVWriter(writer, separator,
+                noQuotes ? CSVWriter.NO_QUOTE_CHARACTER : CSVWriter.DEFAULT_QUOTE_CHARACTER,
+                CSVWriter.DEFAULT_ESCAPE_CHARACTER, emptyLineEnd);
+    }
+
+    /**
+     * Encodes the given values as one CSV row and passes it to the receiver.
+     *
+     * @param rowItemsArray the column values of the row
+     */
+    private void writeRow(final List<String> rowItemsArray) {
+        csvWriter.writeNext(rowItemsArray.toArray(new String[0]));
+        getReceiver().process(writer.toString());
+        // Empty the shared buffer so the next row starts from scratch.
+        writer.getBuffer().setLength(0);
+    }
+
+    @Override
+    public void startRecord(final String identifier) {
+        if (isFirstRecord) {
+            initialize();
+            if (includeRecordId) {
+                header.add("record id");
+            }
+        }
+
+        rowItems = new ArrayList<>();
+
+        if (includeRecordId) {
+            rowItems.add(identifier);
+        }
+    }
+
+    @Override
+    public void endRecord() {
+        if (isFirstRecord) {
+            if (includeHeader) {
+                writeRow(header);
+            }
+            // The collected names are only needed for the header row.
+            header.clear();
+            isFirstRecord = false;
+        }
+
+        writeRow(rowItems);
+        rowItems = new ArrayList<>();
+    }
+
+    @Override
+    public void literal(final String name, final String value) {
+        if (isFirstRecord) {
+            header.add(name);
+        }
+        rowItems.add(value);
+    }
+
+    @Override
+    public void onCloseStream() {
+        // The writer only exists once a record has been started.
+        if (csvWriter == null) {
+            return;
+        }
+        try {
+            csvWriter.close();
+        }
+        catch (final IOException e) {
+            throw new MetafactureException(e);
+        }
+    }
+
+    @Override
+    public void onResetStream() {
+        // NOTE(review): this also clears the includeRecordId/includeHeader options while
+        // separator and noQuotes survive a reset -- confirm that this is intended.
+        this.includeRecordId = false;
+        this.includeHeader = false;
+        this.header = new ArrayList<>();
+        this.isFirstRecord = true;
+        this.rowItems = new ArrayList<>();
+    }
+
+}
diff --git a/metafacture-csv/src/main/resources/flux-commands.properties b/metafacture-csv/src/main/resources/flux-commands.properties
index d51970343..8b55b5c67 100644
--- a/metafacture-csv/src/main/resources/flux-commands.properties
+++ b/metafacture-csv/src/main/resources/flux-commands.properties
@@ -14,3 +14,4 @@
 # limitations under the License.
 #
 decode-csv org.metafacture.csv.CsvDecoder
+encode-csv org.metafacture.csv.CsvEncoder
diff --git a/metafacture-csv/src/test/java/org/metafacture/csv/CsvEncoderTest.java b/metafacture-csv/src/test/java/org/metafacture/csv/CsvEncoderTest.java
new file mode 100644
index 000000000..2356d1f43
--- /dev/null
+++ b/metafacture-csv/src/test/java/org/metafacture/csv/CsvEncoderTest.java
@@ -0,0 +1,191 @@
+/*
+ * Copyright 2018-2023 Deutsche Nationalbibliothek et al
+ *
+ * Licensed under the Apache License, Version 2.0 the "License";
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.metafacture.csv;
+
+import org.junit.Rule;
+import org.metafacture.framework.ObjectReceiver;
+
+import org.junit.Before;
+import org.junit.Test;
+
+import org.mockito.InOrder;
+import org.mockito.Mock;
+import org.mockito.Mockito;
+import org.mockito.MockitoAnnotations;
+import org.mockito.exceptions.base.MockitoAssertionError;
+import org.mockito.junit.MockitoJUnit;
+import org.mockito.junit.MockitoRule;
+
+import java.util.Arrays;
+import java.util.function.Consumer;
+
+/**
+ * Unit tests for the rows emitted by {@link CsvEncoder}.
+ *
+ * @author eberhardtj (j.eberhardt@dnb.de)
+ * @author Pascal Christoph (dr0i)
+ * @author Jens Wille
+ */
+public final class CsvEncoderTest {
+
+    @Rule
+    public MockitoRule rule = MockitoJUnit.rule();
+
+    @Mock
+    private ObjectReceiver<String> receiver;
+    private static final String LITERAL1 = "column 1";
+    private static final String LITERAL2 = "column 2";
+    private static final String RECORD_ID1 = "1";
+    private static final String RECORD_ID2 = "2";
+    private static final String RECORD_ID3 = "3";
+    private static final String VALUE1 = "a";
+    private static final String VALUE2 = "b";
+    private static final String VALUE3 = "c";
+    private static final String VALUE4 = "d";
+    private static final String VALUE5 = "e";
+    private static final String VALUE6 = "f";
+
+    @Test
+    public void shouldReceiveSingleRecord() {
+        assertEncode(encoder -> {
+            encoder.startRecord(RECORD_ID1);
+            encoder.literal(LITERAL1, VALUE1);
+            encoder.literal(LITERAL2, VALUE2);
+            encoder.endRecord();
+        }, "\"a\",\"b\"");
+    }
+
+    @Test
+    public void shouldHaveNoQuotes() {
+        assertEncode(encoder -> {
+            encoder.setNoQuotes(true);
+            encoder.startRecord(RECORD_ID1);
+            encoder.literal(LITERAL1, VALUE1);
+            encoder.literal(LITERAL2, VALUE2);
+            encoder.endRecord();
+        }, "a,b");
+    }
+
+    @Test
+    public void shouldReceiveSingleRecordWithHeader() {
+        assertEncode(encoder -> {
+            encoder.setIncludeHeader(true);
+            encoder.startRecord(RECORD_ID1);
+            encoder.literal(LITERAL1, VALUE1);
+            encoder.literal(LITERAL2, VALUE2);
+            encoder.endRecord();
+        }, "\"column 1\",\"column 2\"", "\"a\",\"b\"");
+    }
+
+    @Test
+    public void shouldReceiveSingleRecordWithRecordId() {
+        assertEncode(encoder -> {
+            encoder.setIncludeRecordId(true);
+            encoder.startRecord(RECORD_ID1);
+            encoder.literal(LITERAL1, VALUE1);
+            encoder.literal(LITERAL2, VALUE2);
+            encoder.endRecord();
+        }, "\"1\",\"a\",\"b\"");
+    }
+
+    @Test
+    public void shouldReceiveSingleRecordWithRecordIdAndHeader() {
+        assertEncode(encoder -> {
+            encoder.setIncludeRecordId(true);
+            encoder.setIncludeHeader(true);
+            encoder.startRecord(RECORD_ID1);
+            encoder.literal(LITERAL1, VALUE1);
+            encoder.literal(LITERAL2, VALUE2);
+            encoder.endRecord();
+        }, "\"record id\",\"column 1\",\"column 2\"", "\"1\",\"a\",\"b\"");
+    }
+
+    @Test
+    public void shouldReceiveThreeRows() {
+        assertEncode(encoder -> {
+            encoder.startRecord(RECORD_ID1);
+            encoder.literal(LITERAL1, VALUE1);
+            encoder.literal(LITERAL2, VALUE2);
+            encoder.endRecord();
+            encoder.startRecord(RECORD_ID2);
+            encoder.literal(LITERAL1, VALUE3);
+            encoder.literal(LITERAL2, VALUE4);
+            encoder.endRecord();
+            encoder.startRecord(RECORD_ID3);
+            encoder.literal(LITERAL1, VALUE5);
+            encoder.literal(LITERAL2, VALUE6);
+            encoder.endRecord();
+        }, "\"a\",\"b\"", "\"c\",\"d\"", "\"e\",\"f\"");
+    }
+
+    @Test
+    public void shouldUseTabulatorAsSeparator() {
+        assertEncode(encoder -> {
+            encoder.setSeparator('\t');
+            encoder.startRecord(RECORD_ID1);
+            encoder.literal(LITERAL1, VALUE1);
+            encoder.literal(LITERAL2, VALUE2);
+            encoder.endRecord();
+        }, "\"a\"\t\"b\"");
+    }
+
+    @Test
+    public void shouldNotCreateNestedCsvInColumn() {
+        assertEncode(encoder -> {
+            encoder.startRecord(RECORD_ID1);
+            encoder.literal(LITERAL1, VALUE1);
+            encoder.literal(LITERAL2, VALUE2);
+            encoder.literal(LITERAL2, VALUE3);
+            encoder.literal(LITERAL2, VALUE4);
+            encoder.endRecord();
+        }, "\"a\",\"b\",\"c\",\"d\"");
+    }
+
+    @Test
+    public void shouldRepeatHeaderForRepeatedColumns() {
+        assertEncode(encoder -> {
+            encoder.setIncludeHeader(true);
+            encoder.startRecord(RECORD_ID1);
+            encoder.literal(LITERAL1, VALUE1);
+            encoder.literal(LITERAL2, VALUE2);
+            encoder.literal(LITERAL2, VALUE3);
+            encoder.literal(LITERAL1, VALUE4);
+            encoder.literal(LITERAL2, VALUE5);
+            encoder.endRecord();
+        }, "\"column 1\",\"column 2\",\"column 2\",\"column 1\",\"column 2\"", "\"a\",\"b\",\"c\",\"d\",\"e\"");
+    }
+
+    /**
+     * Feeds a fresh encoder with the given events and verifies the emitted rows in order.
+     */
+    private void assertEncode(final Consumer<CsvEncoder> in, final String... out) {
+        final CsvEncoder encoder = new CsvEncoder();
+        encoder.setReceiver(receiver);
+        in.accept(encoder);
+
+        final InOrder inOrder = Mockito.inOrder(receiver);
+        try {
+            Arrays.asList(out).forEach(expectedRow -> inOrder.verify(receiver).process(expectedRow));
+            inOrder.verifyNoMoreInteractions();
+            Mockito.verifyNoMoreInteractions(receiver);
+        }
+        catch (final MockitoAssertionError failure) {
+            System.out.println(Mockito.mockingDetails(receiver).printInvocations());
+            throw failure;
+        }
+    }
+}