From 767dab5e989292f8cf902046e52eeff5c9e2df33 Mon Sep 17 00:00:00 2001 From: "mergify[bot]" <37929162+mergify[bot]@users.noreply.github.com> Date: Fri, 3 Jan 2025 07:26:33 +0000 Subject: [PATCH] [Enhancement] Support read compressed csv in files() (backport #54626) (#54665) Co-authored-by: wyb --- be/src/exec/file_scanner.cpp | 5 ++ test/sql/test_files/R/test_csv_compress | 80 ++++++++++++++++++ test/sql/test_files/T/test_csv_compress | 20 +++++ test/sql/test_files/csv_format/basic1.csv.bz2 | Bin 0 -> 88 bytes .../test_files/csv_format/basic1.csv.deflate | Bin 0 -> 57 bytes test/sql/test_files/csv_format/basic1.csv.gz | Bin 0 -> 80 bytes test/sql/test_files/csv_format/basic1.csv.lz4 | Bin 0 -> 71 bytes test/sql/test_files/csv_format/basic1.csv.zst | Bin 0 -> 65 bytes 8 files changed, 105 insertions(+) create mode 100644 test/sql/test_files/R/test_csv_compress create mode 100644 test/sql/test_files/T/test_csv_compress create mode 100644 test/sql/test_files/csv_format/basic1.csv.bz2 create mode 100644 test/sql/test_files/csv_format/basic1.csv.deflate create mode 100644 test/sql/test_files/csv_format/basic1.csv.gz create mode 100644 test/sql/test_files/csv_format/basic1.csv.lz4 create mode 100644 test/sql/test_files/csv_format/basic1.csv.zst diff --git a/be/src/exec/file_scanner.cpp b/be/src/exec/file_scanner.cpp index 368b023553ca8..290458bb27de6 100644 --- a/be/src/exec/file_scanner.cpp +++ b/be/src/exec/file_scanner.cpp @@ -447,6 +447,11 @@ Status FileScanner::sample_schema(RuntimeState* state, const TBrokerScanRange& s break; case TFileFormatType::FORMAT_CSV_PLAIN: + case TFileFormatType::FORMAT_CSV_GZ: + case TFileFormatType::FORMAT_CSV_BZ2: + case TFileFormatType::FORMAT_CSV_LZ4_FRAME: + case TFileFormatType::FORMAT_CSV_DEFLATE: + case TFileFormatType::FORMAT_CSV_ZSTD: p_scanner = std::make_unique<CSVScanner>(state, &profile, sample_range, &counter, true); break; diff --git a/test/sql/test_files/R/test_csv_compress b/test/sql/test_files/R/test_csv_compress new file mode 
100644 index 0000000000000..364ddb462be6d --- /dev/null +++ b/test/sql/test_files/R/test_csv_compress @@ -0,0 +1,80 @@ +-- name: test_csv_compress + +create database db_${uuid0}; +use db_${uuid0}; + +shell: ossutil64 mkdir oss://${oss_bucket}/test_files/csv_format/${uuid0} >/dev/null || echo "exit 0" >/dev/null + +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.gz oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" +-- result: +0 + +Succeed: Total num: 1, size: 80. OK num: 1(upload 1 files). +-- !result + +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.bz2 oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" +-- result: +0 + +Succeed: Total num: 1, size: 88. OK num: 1(upload 1 files). +-- !result + +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.lz4 oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" +-- result: +0 + +Succeed: Total num: 1, size: 71. OK num: 1(upload 1 files). +-- !result + +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.deflate oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" +-- result: +0 + +Succeed: Total num: 1, size: 57. OK num: 1(upload 1 files). +-- !result + +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.zst oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" +-- result: +0 + +Succeed: Total num: 1, size: 65. OK num: 1(upload 1 files). 
+-- !result + + +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.gz", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); +-- result: +1 Julia 20.2 1 +2 Andy 21.3 0 +3 Joke 22.4 1 +-- !result + +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.bz2", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); +-- result: +1 Julia 20.2 1 +2 Andy 21.3 0 +3 Joke 22.4 1 +-- !result + +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.lz4", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); +-- result: +1 Julia 20.2 1 +2 Andy 21.3 0 +3 Joke 22.4 1 +-- !result + +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.deflate", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); +-- result: +1 Julia 20.2 1 +2 Andy 21.3 0 +3 Joke 22.4 1 +-- !result + +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.zst", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); +-- result: +1 Julia 20.2 1 +2 Andy 21.3 0 +3 Joke 22.4 1 +-- !result + + +shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/csv_format/${uuid0}/ > /dev/null diff --git a/test/sql/test_files/T/test_csv_compress b/test/sql/test_files/T/test_csv_compress new file mode 100644 index 0000000000000..5aa5dc952b21d --- /dev/null +++ b/test/sql/test_files/T/test_csv_compress @@ -0,0 +1,20 @@ +-- name: test_csv_compress + +create database db_${uuid0}; +use db_${uuid0}; + +shell: ossutil64 mkdir oss://${oss_bucket}/test_files/csv_format/${uuid0} >/dev/null || echo "exit 0" >/dev/null + +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.gz oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.bz2 
oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.lz4 oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.deflate oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" +shell: ossutil64 cp --force ./sql/test_files/csv_format/basic1.csv.zst oss://${oss_bucket}/test_files/csv_format/${uuid0}/ | grep -Pv "(average|elapsed)" + +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.gz", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.bz2", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.lz4", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.deflate", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); +select * from files("path" = "oss://${oss_bucket}/test_files/csv_format/${uuid0}/basic1.csv.zst", "format" = "csv", "csv.column_separator" = ",", "csv.row_delimiter" = "\n"); + +shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/csv_format/${uuid0}/ > /dev/null diff --git a/test/sql/test_files/csv_format/basic1.csv.bz2 b/test/sql/test_files/csv_format/basic1.csv.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..6779a51108316bba3ab8bb567e7b09650d08e43d GIT binary patch literal 88 zcmV-e0H^;#T4*^jL0KkKS)zVMg8%>r-GBfP00n#iAP^@lo**CqNNPPK(={4s#D1ty uXwjg^4LKYmO4cRbOg4=iycvOo?}heE_3L9gLVJWHzAog7aG@bY{EY_wSRpt7 literal 0 HcmV?d00001 diff --git a/test/sql/test_files/csv_format/basic1.csv.deflate 
b/test/sql/test_files/csv_format/basic1.csv.deflate new file mode 100644 index 0000000000000000000000000000000000000000..b3faee23ef0f293b86460af0539ec4323dba456c GIT binary patch literal 57 zcmb=JV|?YK*4dLjR}76V7+=!V)bf35bgA%+_e!9^72|83-e)xZo*G^Ic=rU((dn@_lM_sql>VN}#|M k<7=MYXEgnu8eRK%>b&1oL!(Q^jM|J0zFT?ZO&Ay$03V+p%>V!Z literal 0 HcmV?d00001 diff --git a/test/sql/test_files/csv_format/basic1.csv.lz4 b/test/sql/test_files/csv_format/basic1.csv.lz4 new file mode 100644 index 0000000000000000000000000000000000000000..13f3388888ed98d8979efcdfeb0d5b2a9cb1eb50 GIT binary patch literal 71 zcmZQk@|8$&SZ>0=&|s+JRhpBTsAFWHXQWe7RGP|Vq~n;EQmJEPsAsH`mY7qV%4MwM Wm7kreV`QXf0#?ER1Yrw*FaQ9pVH1%6 literal 0 HcmV?d00001 diff --git a/test/sql/test_files/csv_format/basic1.csv.zst b/test/sql/test_files/csv_format/basic1.csv.zst new file mode 100644 index 0000000000000000000000000000000000000000..76646d7a181e2a33d3902c7b625d5c5f2b427301 GIT binary patch literal 65 zcmdPcs{dETWFaGip^jH+PG+Kxk%69(PDxQ|DwmOtV_r(7j*+3Bu})fIPH`%iv5r@M RcB+n%k)8=y$>h%&vjK*n6r=zE literal 0 HcmV?d00001