Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Enhancement] Improve files() parquet orc column mismatch error message (backport #54571) #54584

Merged
merged 1 commit into from
Jan 2, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion be/src/exec/csv_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ static std::string make_column_count_not_matched_error_message_for_query(int exp
<< "Column separator: " << string_2_asc(parse_options.column_delimiter) << ", "
<< "Row delimiter: " << string_2_asc(parse_options.row_delimiter) << ", "
<< "Row: '" << row << "', File: " << filename << ". "
- << "Consider setting 'fill_mismatch_column_with' = 'null'";
+ << "Consider setting 'fill_mismatch_column_with' = 'null' property";
return error_msg.str();
}

Expand Down
3 changes: 3 additions & 0 deletions be/src/exec/orc_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,9 @@ Status ORCScanner::_open_next_orc_reader() {
if (st.is_end_of_file()) {
LOG(WARNING) << "Failed to init orc reader. filename: " << file_name << ", status: " << st.to_string();
continue;
} else if (st.is_not_found() &&
(_file_scan_type == TFileScanType::FILES_INSERT || _file_scan_type == TFileScanType::FILES_QUERY)) {
st = st.clone_and_append("Consider setting 'fill_mismatch_column_with' = 'null' property");
}
return st;
}
Expand Down
2 changes: 1 addition & 1 deletion be/src/exec/parquet_reader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ Status ParquetReaderWrap::column_indices(const std::vector<SlotDescriptor*>& tup
std::stringstream str_error;
str_error << "Column: " << slot_desc->col_name() << " is not found in file: " << _filename;
LOG(WARNING) << str_error.str();
- return Status::InvalidArgument(str_error.str());
+ return Status::NotFound(str_error.str());
}
}
return Status::OK();
Expand Down
3 changes: 3 additions & 0 deletions be/src/exec/parquet_scanner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,9 @@ Status ParquetScanner::next_batch() {
_last_file_scan_bytes += incr_bytes;
_state->update_num_bytes_scan_from_source(incr_bytes);
}
} else if (status.is_not_found() && (_file_scan_type == TFileScanType::FILES_INSERT ||
_file_scan_type == TFileScanType::FILES_QUERY)) {
status = status.clone_and_append("Consider setting 'fill_mismatch_column_with' = 'null' property");
}
return status;
}
Expand Down
2 changes: 1 addition & 1 deletion test/sql/test_files/R/test_csv_files_merge
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ select * from files(
"auto_detect_sample_files" = "1",
"fill_mismatch_column_with" = "none");
-- result:
- [REGEX].*Schema column count: 4 doesn't match source value column count: 3. Column separator: ',', Row delimiter: .*, Row: '4,Tom,30.4', File: .*basic0_column_mismatch.csv. Consider setting 'fill_mismatch_column_with' = 'null'.*
+ [REGEX].*Schema column count: 4 doesn't match source value column count: 3. Column separator: ',', Row delimiter: .*, Row: '4,Tom,30.4', File: .*basic0_column_mismatch.csv. Consider setting 'fill_mismatch_column_with' = 'null' property.*
-- !result


Expand Down
13 changes: 13 additions & 0 deletions test/sql/test_files/R/test_orc_files_merge
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,17 @@ None None None
-- !result


select * from files(
"path" = "oss://${oss_bucket}/test_files/orc_format/${uuid0}/*",
"format" = "orc",
"fill_mismatch_column_with" = "none",
"auto_detect_sample_files" = "2",
"aws.s3.access_key" = "${oss_ak}",
"aws.s3.secret_key" = "${oss_sk}",
"aws.s3.endpoint" = "${oss_endpoint}");
-- result:
[REGEX].*Column: k1 is not found in file: .*basic_type_k2k5k7.orc.* Consider setting 'fill_mismatch_column_with' = 'null' property.*
-- !result


shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/orc_format/${uuid0}/ > /dev/null
13 changes: 13 additions & 0 deletions test/sql/test_files/R/test_parquet_files_merge
Original file line number Diff line number Diff line change
Expand Up @@ -115,4 +115,17 @@ None None None
-- !result


select * from files(
"path" = "oss://${oss_bucket}/test_files/parquet_format/${uuid0}/*",
"format" = "parquet",
"fill_mismatch_column_with" = "none",
"auto_detect_sample_files" = "2",
"aws.s3.access_key" = "${oss_ak}",
"aws.s3.secret_key" = "${oss_sk}",
"aws.s3.endpoint" = "${oss_endpoint}");
-- result:
[REGEX].*Column: k1 is not found in file: .*basic_type_k2k5k7.parquet.* Consider setting 'fill_mismatch_column_with' = 'null' property.*
-- !result


shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/parquet_format/${uuid0}/ > /dev/null
10 changes: 10 additions & 0 deletions test/sql/test_files/T/test_orc_files_merge
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,14 @@ select k1, k3, k8 from files(
"aws.s3.secret_key" = "${oss_sk}",
"aws.s3.endpoint" = "${oss_endpoint}");

-- column mismatch
select * from files(
"path" = "oss://${oss_bucket}/test_files/orc_format/${uuid0}/*",
"format" = "orc",
"fill_mismatch_column_with" = "none",
"auto_detect_sample_files" = "2",
"aws.s3.access_key" = "${oss_ak}",
"aws.s3.secret_key" = "${oss_sk}",
"aws.s3.endpoint" = "${oss_endpoint}");

shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/orc_format/${uuid0}/ > /dev/null
10 changes: 10 additions & 0 deletions test/sql/test_files/T/test_parquet_files_merge
Original file line number Diff line number Diff line change
Expand Up @@ -59,4 +59,14 @@ select k1, k3, k8 from files(
"aws.s3.secret_key" = "${oss_sk}",
"aws.s3.endpoint" = "${oss_endpoint}");

-- column mismatch
select * from files(
"path" = "oss://${oss_bucket}/test_files/parquet_format/${uuid0}/*",
"format" = "parquet",
"fill_mismatch_column_with" = "none",
"auto_detect_sample_files" = "2",
"aws.s3.access_key" = "${oss_ak}",
"aws.s3.secret_key" = "${oss_sk}",
"aws.s3.endpoint" = "${oss_endpoint}");

shell: ossutil64 rm -rf oss://${oss_bucket}/test_files/parquet_format/${uuid0}/ > /dev/null
Loading