Skip to content

Commit

Permalink
[Enhancement] Support read compressed csv in files() (backport #54626) (#54665)
Browse files Browse the repository at this point in the history

Co-authored-by: wyb <[email protected]>
  • Loading branch information
mergify[bot] and wyb authored Jan 3, 2025
1 parent 6f315d3 commit 767dab5
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 0 deletions.
5 changes: 5 additions & 0 deletions be/src/exec/file_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,11 @@ Status FileScanner::sample_schema(RuntimeState* state, const TBrokerScanRange& s
break;

case TFileFormatType::FORMAT_CSV_PLAIN:
case TFileFormatType::FORMAT_CSV_GZ:
case TFileFormatType::FORMAT_CSV_BZ2:
case TFileFormatType::FORMAT_CSV_LZ4_FRAME:
case TFileFormatType::FORMAT_CSV_DEFLATE:
case TFileFormatType::FORMAT_CSV_ZSTD:
p_scanner = std::make_unique<CSVScanner>(state, &profile, sample_range, &counter, true);
break;

Expand Down
80 changes: 80 additions & 0 deletions test/sql/test_files/R/test_csv_compress
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
-- name: test_csv_compress
-- Recorded run of the compressed-CSV files() case: each statement below is
-- followed by the output captured for it between the result markers.
-- Five compressed copies of basic1.csv (gz, bz2, lz4, deflate, zst) are uploaded
-- to a per-run OSS prefix keyed by ${uuid0}, read back through files(), and the
-- prefix is removed at the end.

create database db_${uuid0};
use db_${uuid0};

-- Create the per-run prefix. All output is discarded, and the `|| echo` fallback
-- keeps the step from failing when mkdir returns non-zero.
shell: ossutil64 mkdir oss://${oss_bucket}/test_files/csv_format/${uuid0} >/dev/null || echo "exit 0" >/dev/null

-- Upload one fixture per compression suffix. grep -Pv drops output lines that
-- contain "average" or "elapsed" (presumably run-dependent timing lines, to be confirmed).
shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.gz oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"
-- result:
0

Succeed: Total num: 1, size: 80. OK num: 1(upload 1 files).
-- !result

shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.bz2 oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"
-- result:
0

Succeed: Total num: 1, size: 88. OK num: 1(upload 1 files).
-- !result

shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.lz4 oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"
-- result:
0

Succeed: Total num: 1, size: 71. OK num: 1(upload 1 files).
-- !result

shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.deflate oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"
-- result:
0

Succeed: Total num: 1, size: 57. OK num: 1(upload 1 files).
-- !result

shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.zst oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"
-- result:
0

Succeed: Total num: 1, size: 65. OK num: 1(upload 1 files).
-- !result


-- Query each uploaded file. Every statement passes the same options
-- ("format" = "csv", comma column separator, "\n" row delimiter) and no
-- compression option, so only the path suffix differs between them.
-- The same three rows are recorded for all five files.
select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.gz", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");
-- result:
1 Julia 20.2 1
2 Andy 21.3 0
3 Joke 22.4 1
-- !result

select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.bz2", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");
-- result:
1 Julia 20.2 1
2 Andy 21.3 0
3 Joke 22.4 1
-- !result

select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.lz4", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");
-- result:
1 Julia 20.2 1
2 Andy 21.3 0
3 Joke 22.4 1
-- !result

select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.deflate", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");
-- result:
1 Julia 20.2 1
2 Andy 21.3 0
3 Joke 22.4 1
-- !result

select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.zst", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");
-- result:
1 Julia 20.2 1
2 Andy 21.3 0
3 Joke 22.4 1
-- !result


-- Remove everything uploaded under the per-run prefix.
shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/csv_format/${uuid0}/ > /dev/null
20 changes: 20 additions & 0 deletions test/sql/test_files/T/test_csv_compress
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
-- name: test_csv_compress
-- Reads basic1.csv through files() in five compressed forms (gz, bz2, lz4,
-- deflate, zst). Each file is uploaded to a per-run OSS prefix keyed by
-- ${uuid0}, queried with "format" = "csv", and the prefix is removed at the end.

create database db_${uuid0};
use db_${uuid0};

-- Create the per-run prefix. All output is discarded, and the `|| echo` fallback
-- keeps the step from failing when mkdir returns non-zero.
shell: ossutil64 mkdir oss://${oss_bucket}/test_files/csv_format/${uuid0} >/dev/null || echo "exit 0" >/dev/null

-- Upload one fixture per compression suffix. grep -Pv drops output lines that
-- contain "average" or "elapsed" (presumably run-dependent timing lines, to be confirmed).
shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.gz oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"
shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.bz2 oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"
shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.lz4 oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"
shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.deflate oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"
shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.zst oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)"

-- Query each uploaded file. Every statement passes the same options
-- ("format" = "csv", comma column separator, "\n" row delimiter) and no
-- compression option, so only the path suffix differs between them.
select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.gz", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");
select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.bz2", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");
select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.lz4", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");
select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.deflate", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");
select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.zst", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n");

-- Remove everything uploaded under the per-run prefix.
shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/csv_format/${uuid0}/ > /dev/null
Binary file added test/sql/test_files/csv_format/basic1.csv.bz2
Binary file not shown.
Binary file added test/sql/test_files/csv_format/basic1.csv.deflate
Binary file not shown.
Binary file added test/sql/test_files/csv_format/basic1.csv.gz
Binary file not shown.
Binary file added test/sql/test_files/csv_format/basic1.csv.lz4
Binary file not shown.
Binary file added test/sql/test_files/csv_format/basic1.csv.zst
Binary file not shown.

0 comments on commit 767dab5

Please sign in to comment.