[#228] HTML tables are now being processed directly, without conversion to CSV
kbss-cvut · Dec 18, 2024 · 339d972 · 339d972
1 parent 509aa12
commit 339d972
Show file tree

Hide file tree

Showing 5 changed files with 109 additions and 114 deletions.
diff --git a/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java b/s-pipes-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/TabularModule.java
@@ -22,14 +22,7 @@
 import cz.cvut.spipes.registry.StreamResourceRegistry;
 import cz.cvut.spipes.util.JenaUtils;
 import org.apache.commons.cli.MissingArgumentException;
-import org.apache.commons.lang3.ObjectUtils;
 import org.apache.jena.rdf.model.*;
-import org.apache.poi.hssf.usermodel.HSSFWorkbook;
-import org.apache.poi.ss.usermodel.DataFormatter;
-import org.apache.poi.ss.usermodel.Sheet;
-import org.apache.poi.ss.usermodel.Workbook;
-import org.apache.poi.ss.util.CellRangeAddress;
-import org.apache.poi.xssf.usermodel.XSSFWorkbook;
 import org.jetbrains.annotations.NotNull;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -46,7 +39,6 @@
 import java.nio.charset.StandardCharsets;
 import java.util.*;
 import java.util.function.Supplier;
-import java.util.stream.StreamSupport;
 
 /**
  * Module for converting input that contains tabular data (e.g. CSV, TSV, XLS, HTML) to RDF
@@ -208,16 +200,7 @@ ExecutionContext executeSelf() {
                 if (processTableAtIndex != 1) {
                     throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet.");
                 }
-                tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
-                table.setLabel(tsvConvertor.getTableName(sourceResource));
-                setSourceResource(tsvConvertor.convertToTSV(sourceResource));
-                setDelimiter('\t');
-
-                csvPreference = new CsvPreference.Builder(
-                        quoteCharacter,
-                        delimiter,
-                        System.lineSeparator()).build();
-                fileReaderAdapter = new HTMLFileReaderAdapter(csvPreference);
+                fileReaderAdapter = new HTMLFileReaderAdapter();
                 break;
             case XLS:
             case XLSM:
@@ -248,7 +231,7 @@ ExecutionContext executeSelf() {
         List<Statement> rowStatements = new ArrayList<>();
 
         try {
-            fileReaderAdapter.initialise(sourceResource, sourceResourceFormat, processTableAtIndex);
+            fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
             String[] header = fileReaderAdapter.getHeader();
             Set<String> columnNames = new HashSet<>();
 
@@ -265,11 +248,10 @@ ExecutionContext executeSelf() {
 
             if (skipHeader) {
                 header = getHeaderFromSchema(inputModel, header, hasInputSchema);
-                fileReaderAdapter.initialise(sourceResource, sourceResourceFormat, processTableAtIndex);
+                fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
             } else if (hasInputSchema) {
                 header = getHeaderFromSchema(inputModel, header, true);
             }
-
             em.getTransaction().commit();
             em.close();
             em.getEntityManagerFactory().close();
@@ -300,7 +282,6 @@ ExecutionContext executeSelf() {
                     break;
             }
             while ((row = fileReaderAdapter.getNextRow()) != null) {
-                //row = fileReaderAdapter.getNextRow();
                 rowNumber++;
                 // 4.6.1 and 4.6.3
                 Row r = new Row();
@@ -346,7 +327,7 @@ ExecutionContext executeSelf() {
             em.persist(tableGroup);
             em.merge(tableSchema);
 
-            List<Region> regions = fileReaderAdapter.getMergedRegions(originalSourceResource);
+            List<Region> regions = fileReaderAdapter.getMergedRegions();
 
             int cellsNum = 1;
             for (Region region : regions) {

diff --git a/...odules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVFileReaderAdapter.java b/...odules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/CSVFileReaderAdapter.java
@@ -2,17 +2,14 @@
 
 import cz.cvut.spipes.modules.ResourceFormat;
 import cz.cvut.spipes.modules.model.Region;
-import cz.cvut.spipes.registry.StreamResource;
 import org.supercsv.io.CsvListReader;
 import org.supercsv.io.ICsvListReader;
 import org.supercsv.prefs.CsvPreference;
 
-import java.io.ByteArrayInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.List;
 
 public class CSVFileReaderAdapter implements FileReaderAdapter {
@@ -24,8 +21,8 @@ public CSVFileReaderAdapter(CsvPreference csvPreference) {
     }
 
     @Override
-    public void initialise(StreamResource sourceResource, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
-        listReader = new CsvListReader(new InputStreamReader(new ByteArrayInputStream(sourceResource.getContent())), csvPreference);
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
+        listReader = new CsvListReader(new InputStreamReader(inputStream), csvPreference);
     }
 
     @Override
@@ -44,7 +41,7 @@ public List<String> getNextRow() throws IOException {
     }
 
     @Override
-    public List<Region> getMergedRegions(StreamResource sourceResource) {
+    public List<Region> getMergedRegions() {
          return new ArrayList<>();
     }
 

diff --git a/...s-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/FileReaderAdapter.java b/...s-modules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/FileReaderAdapter.java
@@ -1,18 +1,21 @@
 package cz.cvut.spipes.modules.util;
 
 import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.TabularModule;
 import cz.cvut.spipes.modules.model.Region;
-import cz.cvut.spipes.registry.StreamResource;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.io.InputStream;
 import java.util.List;
 
 public interface FileReaderAdapter {
-    void initialise(StreamResource sourceResource, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException;
+    static final Logger LOG = LoggerFactory.getLogger(TabularModule.class);
+    void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException;
     String[] getHeader() throws IOException;
     boolean hasNext() throws IOException;
     List<String> getNextRow() throws IOException;
-    List<Region> getMergedRegions(StreamResource sourceResource);
+    List<Region> getMergedRegions();
     String getLabel() throws IOException;
 }
diff --git a/...dules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLFileReaderAdapter.java b/...dules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/HTMLFileReaderAdapter.java
@@ -1,116 +1,124 @@
 package cz.cvut.spipes.modules.util;
 
-import cz.cvut.spipes.InvalidQuotingTokenizer;
-import cz.cvut.spipes.constants.HTML;
 import cz.cvut.spipes.modules.ResourceFormat;
 import cz.cvut.spipes.modules.model.Region;
-import cz.cvut.spipes.registry.StreamResource;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Document;
 import org.jsoup.nodes.Element;
 import org.jsoup.select.Elements;
-import org.supercsv.io.CsvListReader;
-import org.supercsv.io.ICsvListReader;
-import org.supercsv.prefs.CsvPreference;
 
 import java.io.*;
-import java.nio.charset.Charset;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.List;
+import java.util.*;
 
 public class HTMLFileReaderAdapter implements FileReaderAdapter {
-    private ICsvListReader listReader;
-    private CsvPreference csvPreference;
-    private TSVConvertor tsvConvertor;
+    private Elements rows;
+    private int currentIndex;
+    private Element table;
     private String label;
-    private StreamResource sourceResource;
-    private StreamResource originalSourceResource;
-    private Charset inputCharset = Charset.defaultCharset();
-    private boolean acceptInvalidQuoting = false;
 
-    public HTMLFileReaderAdapter(CsvPreference csvPreference) {
-        this.csvPreference = csvPreference;
-    }
+    private List<Region> mergedRegions;
+    private Map<Integer, Map<Integer, String>> mergedCells;
 
     @Override
-    public void initialise(StreamResource sourceResource, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
-        this.sourceResource = sourceResource;
-        tsvConvertor = new HTML2TSVConvertor(tableIndex);
-        listReader = getCsvListReader(csvPreference);
-        this.sourceResource = tsvConvertor.convertToTSV(sourceResource);
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
+        Document doc = Jsoup.parse(inputStream, "UTF-8", "");
+        Element table = doc.select("table").first();
+        rows = table.select("tr");
+        currentIndex = 0;
+        this.table = table;
+        mergedRegions = extractMergedRegions(table);
+        mergedCells = new HashMap<>();
+        label = table.attr("data-name");
     }
 
+
     @Override
     public String[] getHeader() throws IOException {
-        return listReader.getHeader(true);
+        Elements headerCells = rows.get(0).select("th, td");
+        return headerCells.stream()
+                .map(Element::text)
+                .toArray(String[]::new);
     }
 
     @Override
-    public boolean hasNext() throws IOException {
-        return listReader.read() != null;
+    public boolean hasNext() {
+        return currentIndex < rows.size() - 1; // Skip header row
     }
 
     @Override
-    public List<String> getNextRow() throws IOException {
-        return listReader.read();
-    }
+    public List<String> getNextRow() {
+        if (!hasNext()) {
+            return null;
+        }
 
-    @Override
-    public List<Region> getMergedRegions(StreamResource sourceResource) {
-        List<Region> list = new ArrayList<>();
-        Document doc = Jsoup.parseBodyFragment(new String(sourceResource.getContent()));
-
-        Elements rows = doc.getElementsByTag(HTML.TABLE_ROW_TAG);
-
-        for (Element row : rows) {
-            Elements cells = row.getElementsByTag(HTML.TABLE_HEADER_TAG);
-            cells.addAll(row.getElementsByTag(HTML.TABLE_CELL_TAG));
-            int rowNum = row.elementSiblingIndex();
-            int colNum = 0;
-            for(Element cell : cells) {
-                int colspan = parseInt(cell.attr("colspan"), 1);
-                int rowspan = parseInt(cell.attr("rowspan"), 1);
-                if (colspan > 1 || rowspan > 1) {
-                    list.add(new Region(
-                            rowNum,
-                            colNum,
-                            rowNum+rowspan-1,
-                            colNum+colspan-1)
-                    );
+        currentIndex++;
+        Elements cells = rows.get(currentIndex).select("td, th");
+        List<String> row = new ArrayList<>();
+        int cellIndex = 0;
+
+        for (Element cell : cells) {
+            int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan"));
+            int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan"));
+            String cellValue = cell.text();
+
+            if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) {
+                cellValue = cellValue.replace(",", ".");
+            }
+
+            while (row.size() < cellIndex) {
+                row.add(null);
+            }
+
+            row.add(cellValue);
+
+            for (int i = 1; i < colspan; i++) {
+                row.add(null);
+            }
+
+            if (rowspan > 1) {
+                for (int i = 1; i < rowspan; i++) {
+                    mergedCells.computeIfAbsent(currentIndex + i, k -> new HashMap<>()).put(cellIndex, cellValue);
                 }
-                colNum+=colspan;
             }
+
+            cellIndex += colspan;
+        }
+
+        if (mergedCells.containsKey(currentIndex)) {
+            Map<Integer, String> rowMergedCells = mergedCells.get(currentIndex);
+            for (Map.Entry<Integer, String> entry : rowMergedCells.entrySet()) {
+                row.add(entry.getKey(), null);
+            }
+            mergedCells.remove(currentIndex);
         }
-        return list;
+
+        return row;
     }
 
     @Override
-    public String getLabel(){
-        return this.label;
+    public List<Region> getMergedRegions() {
+        return mergedRegions;
     }
 
-    private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
-        if (acceptInvalidQuoting) {
-            if (csvPreference.getQuoteChar() == '\0') {
-                return null;
-            } else
-                return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference);
+    private List<Region> extractMergedRegions(Element table) {
+        List<Region> regions = new ArrayList<>();
+        Elements rows = table.select("tr");
+        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
+            Elements cells = rows.get(rowIndex).select("td, th");
+            for (int colIndex = 0; colIndex < cells.size(); colIndex++) {
+                Element cell = cells.get(colIndex);
+                int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan"));
+                int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan"));
+                if (colspan > 1 || rowspan > 1) {
+                    regions.add(new Region(rowIndex, colIndex, rowIndex + rowspan - 1, colIndex + colspan - 1));
+                }
+            }
         }
-        return new CsvListReader(getReader(), csvPreference);
-    }
-
-    private Reader getReader() {
-        return new StringReader(new String(sourceResource.getContent(), inputCharset));
+        return regions;
     }
 
-    int parseInt(String s,int defaultValue){
-        int res = 0;
-        try {
-            res = Integer.parseInt(s);
-        } catch (java.lang.NumberFormatException e){
-            res = defaultValue;
-        }
-        return res;
+    @Override
+    public String getLabel(){
+        return label;
     }
 }
diff --git a/...odules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSFileReaderAdapter.java b/...odules/module-tabular/src/main/java/cz/cvut/spipes/modules/util/XLSFileReaderAdapter.java
@@ -1,11 +1,10 @@
 package cz.cvut.spipes.modules.util;
 
 import cz.cvut.spipes.modules.ResourceFormat;
+import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
 import cz.cvut.spipes.modules.model.Region;
-import cz.cvut.spipes.registry.StreamResource;
 import org.apache.poi.hssf.usermodel.HSSFWorkbook;
 import org.apache.poi.ss.usermodel.DataFormatter;
-import org.apache.poi.ss.usermodel.Row;
 import org.apache.poi.ss.usermodel.Sheet;
 import org.apache.poi.ss.usermodel.Workbook;
 import org.apache.poi.ss.util.CellRangeAddress;
@@ -25,13 +24,20 @@ public class XLSFileReaderAdapter implements FileReaderAdapter {
     private Iterator<org.apache.poi.ss.usermodel.Row> rowIterator;
 
     @Override
-    public void initialise(StreamResource sourceResource, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
+    public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
         Workbook workbook;
         if (sourceResourceFormat == ResourceFormat.XLS) {
-            workbook = new HSSFWorkbook(new ByteArrayInputStream(sourceResource.getContent()));
+            workbook = new HSSFWorkbook(inputStream);
         } else {
-            workbook = new XSSFWorkbook(new ByteArrayInputStream(sourceResource.getContent()));
+            workbook = new XSSFWorkbook(inputStream);
         }
+        if ((tableIndex > workbook.getNumberOfSheets()) || (tableIndex < 1)) {
+                    LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
+                            workbook.getNumberOfSheets(),
+                            tableIndex
+                    );
+                    throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
+                }
         sheet = workbook.getSheetAt(tableIndex - 1);
         rowIterator = sheet.iterator();
     }
@@ -68,7 +74,7 @@ public List<String> getNextRow() {
     }
 
     @Override
-    public List<Region> getMergedRegions(StreamResource sourceResource) {
+    public List<Region> getMergedRegions() {
         List<Region> regions = new ArrayList<>();
         for (int i = 0; i < sheet.getNumMergedRegions(); i++) {
             CellRangeAddress region = sheet.getMergedRegion(i);