Skip to content

Commit

Permalink
[#228] HTML tables are now being processed directly, without conversio…
Browse files Browse the repository at this point in the history
…n to csv
  • Loading branch information
Evgenii Grigorev committed Dec 18, 2024
1 parent 509aa12 commit 339d972
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 114 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,7 @@
import cz.cvut.spipes.registry.StreamResourceRegistry;
import cz.cvut.spipes.util.JenaUtils;
import org.apache.commons.cli.MissingArgumentException;
import org.apache.commons.lang3.ObjectUtils;
import org.apache.jena.rdf.model.*;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.CellRangeAddress;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand All @@ -46,7 +39,6 @@
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.function.Supplier;
import java.util.stream.StreamSupport;

/**
* Module for converting input that contains tabular data (e.g. CSV, TSV, XLS, HTML) to RDF
Expand Down Expand Up @@ -208,16 +200,7 @@ ExecutionContext executeSelf() {
if (processTableAtIndex != 1) {
throw new UnsupportedOperationException("Support for 'process-table-at-index' different from 1 is not implemented for HTML files yet.");
}
tsvConvertor = new HTML2TSVConvertor(processTableAtIndex);
table.setLabel(tsvConvertor.getTableName(sourceResource));
setSourceResource(tsvConvertor.convertToTSV(sourceResource));
setDelimiter('\t');

csvPreference = new CsvPreference.Builder(
quoteCharacter,
delimiter,
System.lineSeparator()).build();
fileReaderAdapter = new HTMLFileReaderAdapter(csvPreference);
fileReaderAdapter = new HTMLFileReaderAdapter();
break;
case XLS:
case XLSM:
Expand Down Expand Up @@ -248,7 +231,7 @@ ExecutionContext executeSelf() {
List<Statement> rowStatements = new ArrayList<>();

try {
fileReaderAdapter.initialise(sourceResource, sourceResourceFormat, processTableAtIndex);
fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
String[] header = fileReaderAdapter.getHeader();
Set<String> columnNames = new HashSet<>();

Expand All @@ -265,11 +248,10 @@ ExecutionContext executeSelf() {

if (skipHeader) {
header = getHeaderFromSchema(inputModel, header, hasInputSchema);
fileReaderAdapter.initialise(sourceResource, sourceResourceFormat, processTableAtIndex);
fileReaderAdapter.initialise(new ByteArrayInputStream(sourceResource.getContent()), sourceResourceFormat, processTableAtIndex);
} else if (hasInputSchema) {
header = getHeaderFromSchema(inputModel, header, true);
}

em.getTransaction().commit();
em.close();
em.getEntityManagerFactory().close();
Expand Down Expand Up @@ -300,7 +282,6 @@ ExecutionContext executeSelf() {
break;
}
while ((row = fileReaderAdapter.getNextRow()) != null) {
//row = fileReaderAdapter.getNextRow();
rowNumber++;
// 4.6.1 and 4.6.3
Row r = new Row();
Expand Down Expand Up @@ -346,7 +327,7 @@ ExecutionContext executeSelf() {
em.persist(tableGroup);
em.merge(tableSchema);

List<Region> regions = fileReaderAdapter.getMergedRegions(originalSourceResource);
List<Region> regions = fileReaderAdapter.getMergedRegions();

int cellsNum = 1;
for (Region region : regions) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,14 @@

import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.model.Region;
import cz.cvut.spipes.registry.StreamResource;
import org.supercsv.io.CsvListReader;
import org.supercsv.io.ICsvListReader;
import org.supercsv.prefs.CsvPreference;

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class CSVFileReaderAdapter implements FileReaderAdapter {
Expand All @@ -24,8 +21,8 @@ public CSVFileReaderAdapter(CsvPreference csvPreference) {
}

@Override
public void initialise(StreamResource sourceResource, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
listReader = new CsvListReader(new InputStreamReader(new ByteArrayInputStream(sourceResource.getContent())), csvPreference);
public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
listReader = new CsvListReader(new InputStreamReader(inputStream), csvPreference);
}

@Override
Expand All @@ -44,7 +41,7 @@ public List<String> getNextRow() throws IOException {
}

@Override
public List<Region> getMergedRegions(StreamResource sourceResource) {
public List<Region> getMergedRegions() {
return new ArrayList<>();
}

Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
package cz.cvut.spipes.modules.util;

import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.TabularModule;
import cz.cvut.spipes.modules.model.Region;
import cz.cvut.spipes.registry.StreamResource;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.io.InputStream;
import java.util.List;

public interface FileReaderAdapter {
void initialise(StreamResource sourceResource, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException;
static final Logger LOG = LoggerFactory.getLogger(TabularModule.class);
void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException;
String[] getHeader() throws IOException;
boolean hasNext() throws IOException;
List<String> getNextRow() throws IOException;
List<Region> getMergedRegions(StreamResource sourceResource);
List<Region> getMergedRegions();
String getLabel() throws IOException;
}
Original file line number Diff line number Diff line change
@@ -1,116 +1,124 @@
package cz.cvut.spipes.modules.util;

import cz.cvut.spipes.InvalidQuotingTokenizer;
import cz.cvut.spipes.constants.HTML;
import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.model.Region;
import cz.cvut.spipes.registry.StreamResource;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.supercsv.io.CsvListReader;
import org.supercsv.io.ICsvListReader;
import org.supercsv.prefs.CsvPreference;

import java.io.*;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.*;

public class HTMLFileReaderAdapter implements FileReaderAdapter {
private ICsvListReader listReader;
private CsvPreference csvPreference;
private TSVConvertor tsvConvertor;
private Elements rows;
private int currentIndex;
private Element table;
private String label;
private StreamResource sourceResource;
private StreamResource originalSourceResource;
private Charset inputCharset = Charset.defaultCharset();
private boolean acceptInvalidQuoting = false;

public HTMLFileReaderAdapter(CsvPreference csvPreference) {
this.csvPreference = csvPreference;
}
private List<Region> mergedRegions;
private Map<Integer, Map<Integer, String>> mergedCells;

@Override
public void initialise(StreamResource sourceResource, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
this.sourceResource = sourceResource;
tsvConvertor = new HTML2TSVConvertor(tableIndex);
listReader = getCsvListReader(csvPreference);
this.sourceResource = tsvConvertor.convertToTSV(sourceResource);
public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
Document doc = Jsoup.parse(inputStream, "UTF-8", "");
Element table = doc.select("table").first();
rows = table.select("tr");
currentIndex = 0;
this.table = table;
mergedRegions = extractMergedRegions(table);
mergedCells = new HashMap<>();
label = table.attr("data-name");
}


@Override
public String[] getHeader() throws IOException {
return listReader.getHeader(true);
Elements headerCells = rows.get(0).select("th, td");
return headerCells.stream()
.map(Element::text)
.toArray(String[]::new);
}

@Override
public boolean hasNext() throws IOException {
return listReader.read() != null;
public boolean hasNext() {
return currentIndex < rows.size() - 1; // Skip header row
}

@Override
public List<String> getNextRow() throws IOException {
return listReader.read();
}
public List<String> getNextRow() {
if (!hasNext()) {
return null;
}

@Override
public List<Region> getMergedRegions(StreamResource sourceResource) {
List<Region> list = new ArrayList<>();
Document doc = Jsoup.parseBodyFragment(new String(sourceResource.getContent()));

Elements rows = doc.getElementsByTag(HTML.TABLE_ROW_TAG);

for (Element row : rows) {
Elements cells = row.getElementsByTag(HTML.TABLE_HEADER_TAG);
cells.addAll(row.getElementsByTag(HTML.TABLE_CELL_TAG));
int rowNum = row.elementSiblingIndex();
int colNum = 0;
for(Element cell : cells) {
int colspan = parseInt(cell.attr("colspan"), 1);
int rowspan = parseInt(cell.attr("rowspan"), 1);
if (colspan > 1 || rowspan > 1) {
list.add(new Region(
rowNum,
colNum,
rowNum+rowspan-1,
colNum+colspan-1)
);
currentIndex++;
Elements cells = rows.get(currentIndex).select("td, th");
List<String> row = new ArrayList<>();
int cellIndex = 0;

for (Element cell : cells) {
int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan"));
int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan"));
String cellValue = cell.text();

if (cellValue != null && cellValue.matches("[-+]?[0-9]*\\,?[0-9]+")) {
cellValue = cellValue.replace(",", ".");
}

while (row.size() < cellIndex) {
row.add(null);
}

row.add(cellValue);

for (int i = 1; i < colspan; i++) {
row.add(null);
}

if (rowspan > 1) {
for (int i = 1; i < rowspan; i++) {
mergedCells.computeIfAbsent(currentIndex + i, k -> new HashMap<>()).put(cellIndex, cellValue);
}
colNum+=colspan;
}

cellIndex += colspan;
}

if (mergedCells.containsKey(currentIndex)) {
Map<Integer, String> rowMergedCells = mergedCells.get(currentIndex);
for (Map.Entry<Integer, String> entry : rowMergedCells.entrySet()) {
row.add(entry.getKey(), null);
}
mergedCells.remove(currentIndex);
}
return list;

return row;
}

@Override
public String getLabel(){
return this.label;
public List<Region> getMergedRegions() {
return mergedRegions;
}

private ICsvListReader getCsvListReader(CsvPreference csvPreference) {
if (acceptInvalidQuoting) {
if (csvPreference.getQuoteChar() == '\0') {
return null;
} else
return new CsvListReader(new InvalidQuotingTokenizer(getReader(), csvPreference), csvPreference);
private List<Region> extractMergedRegions(Element table) {
List<Region> regions = new ArrayList<>();
Elements rows = table.select("tr");
for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
Elements cells = rows.get(rowIndex).select("td, th");
for (int colIndex = 0; colIndex < cells.size(); colIndex++) {
Element cell = cells.get(colIndex);
int colspan = Integer.parseInt(cell.attr("colspan").isEmpty() ? "1" : cell.attr("colspan"));
int rowspan = Integer.parseInt(cell.attr("rowspan").isEmpty() ? "1" : cell.attr("rowspan"));
if (colspan > 1 || rowspan > 1) {
regions.add(new Region(rowIndex, colIndex, rowIndex + rowspan - 1, colIndex + colspan - 1));
}
}
}
return new CsvListReader(getReader(), csvPreference);
}

private Reader getReader() {
return new StringReader(new String(sourceResource.getContent(), inputCharset));
return regions;
}

int parseInt(String s,int defaultValue){
int res = 0;
try {
res = Integer.parseInt(s);
} catch (java.lang.NumberFormatException e){
res = defaultValue;
}
return res;
@Override
public String getLabel(){
return label;
}
}
Original file line number Diff line number Diff line change
@@ -1,11 +1,10 @@
package cz.cvut.spipes.modules.util;

import cz.cvut.spipes.modules.ResourceFormat;
import cz.cvut.spipes.modules.exception.SheetDoesntExistsException;
import cz.cvut.spipes.modules.model.Region;
import cz.cvut.spipes.registry.StreamResource;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.DataFormatter;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.ss.util.CellRangeAddress;
Expand All @@ -25,13 +24,20 @@ public class XLSFileReaderAdapter implements FileReaderAdapter {
private Iterator<org.apache.poi.ss.usermodel.Row> rowIterator;

@Override
public void initialise(StreamResource sourceResource, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
public void initialise(InputStream inputStream, ResourceFormat sourceResourceFormat, int tableIndex) throws IOException {
Workbook workbook;
if (sourceResourceFormat == ResourceFormat.XLS) {
workbook = new HSSFWorkbook(new ByteArrayInputStream(sourceResource.getContent()));
workbook = new HSSFWorkbook(inputStream);
} else {
workbook = new XSSFWorkbook(new ByteArrayInputStream(sourceResource.getContent()));
workbook = new XSSFWorkbook(inputStream);
}
if ((tableIndex > workbook.getNumberOfSheets()) || (tableIndex < 1)) {
LOG.error("Requested sheet doesn't exist, number of sheets in the doc: {}, requested sheet: {}",
workbook.getNumberOfSheets(),
tableIndex
);
throw new SheetDoesntExistsException("Requested sheet doesn't exists.");
}
sheet = workbook.getSheetAt(tableIndex - 1);
rowIterator = sheet.iterator();
}
Expand Down Expand Up @@ -68,7 +74,7 @@ public List<String> getNextRow() {
}

@Override
public List<Region> getMergedRegions(StreamResource sourceResource) {
public List<Region> getMergedRegions() {
List<Region> regions = new ArrayList<>();
for (int i = 0; i < sheet.getNumMergedRegions(); i++) {
CellRangeAddress region = sheet.getMergedRegion(i);
Expand Down

0 comments on commit 339d972

Please sign in to comment.