Skip to content

Commit

Permalink
Fix reading html from InputStream
Browse files: browse the repository at this point in the history
  • Loading branch information
benmccann committed Mar 24, 2019
1 parent 5a156c2 commit d410a90
Showing 1 changed file with 11 additions and 3 deletions.
14 changes: 11 additions & 3 deletions html/src/main/java/tech/tablesaw/io/html/HtmlReader.java
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
package tech.tablesaw.io.html;

import java.io.IOException;
import java.io.Reader;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Stream;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;
Expand Down Expand Up @@ -33,8 +34,15 @@ public static void register(ReaderRegistry registry) {

@Override
public Table read(HtmlReadOptions options) throws IOException {
Reader reader = options.source().createReader(null);
Document doc = Parser.htmlParser().parseInput(reader, "");
Document doc;
InputStream inputStream = options.source().inputStream();
if (inputStream != null) {
// Reader must support mark, so can't use InputStreamReader
// Parse the InputStream directly
doc = Jsoup.parse(inputStream, null, "");
} else {
doc = Parser.htmlParser().parseInput(options.source().createReader(null), "");
}
Elements tables = doc.select("table");
if (tables.size() != 1) {
throw new IllegalStateException(
Expand Down

0 comments on commit d410a90

Please sign in to comment.