diff --git a/src/core/storage/sframe_interface/unity_sframe.cpp b/src/core/storage/sframe_interface/unity_sframe.cpp index d713cfa549..b0bff7c4b2 100644 --- a/src/core/storage/sframe_interface/unity_sframe.cpp +++ b/src/core/storage/sframe_interface/unity_sframe.cpp @@ -426,9 +426,9 @@ std::shared_ptr unity_sframe::select_column(const std::string size_t column_index = _column_index_iter - _column_names.begin(); auto ret = select_column(column_index); - DASSERT_EQ(std::static_pointer_cast(ret)->dtype(), dtype(name)); + DASSERT_EQ(std::static_pointer_cast(ret)->dtype(), dtype(name)); - return ret; + return ret; } std::shared_ptr unity_sframe::select_column(size_t column_index) { @@ -451,24 +451,24 @@ std::shared_ptr unity_sframe::select_columns( #ifndef NDEBUG { std::shared_ptr X = std::static_pointer_cast(ret); - + DASSERT_EQ(X->num_columns(), names.size()); - for(size_t i = 0; i < names.size(); ++i) { + for(size_t i = 0; i < names.size(); ++i) { DASSERT_EQ(names[i], X->column_name(i)); DASSERT_EQ(this->dtype(names[i]), X->dtype(i)); } } #endif - return ret; + return ret; } std::shared_ptr unity_sframe::select_columns( const std::vector& indices) { Dlog_func_entry(); - if(indices.empty()) { + if(indices.empty()) { return std::make_shared(); } @@ -479,7 +479,7 @@ std::shared_ptr unity_sframe::select_columns( } for(size_t i = 0; i < indices.size(); ++i) { - size_t col_idx = indices[i]; + size_t col_idx = indices[i]; if(col_idx >= m_column_names.size()) { std_log_and_throw(std::range_error, "Column index out of bounds."); } @@ -923,8 +923,10 @@ std::shared_ptr unity_sframe::append( return ret; } - // Error checking + // Error checking and reorder other sframe if necessary { + bool needs_reorder = false; + if (this->num_columns() != other_sframe->num_columns()) { log_and_throw("Two SFrames have different number of columns"); } @@ -934,6 +936,8 @@ std::shared_ptr unity_sframe::append( size_t num_columns = column_names.size(); if(column_names != other_column_names) { + needs_reorder = true; + std::sort(column_names.begin(), column_names.end()); std::sort(other_column_names.begin(), other_column_names.end()); @@ -951,8 +955,14 @@ std::shared_ptr unity_sframe::append( log_and_throw(ss.str().c_str()); } + + if (needs_reorder) { + other_sframe = std::static_pointer_cast( + other_sframe->select_columns(this->column_names())); + } } + auto column_types = this->dtype(); auto other_column_types = other_sframe->dtype(); @@ -1278,11 +1288,11 @@ unity_sframe::sort(const std::vector& sort_keys, } std::vector sort_indices; - - if(sort_keys.empty()) { - sort_indices.resize(this->num_columns()); - std::iota(sort_indices.begin(), sort_indices.end(), 0); - } else { + + if(sort_keys.empty()) { + sort_indices.resize(this->num_columns()); + std::iota(sort_indices.begin(), sort_indices.end(), 0); + } else { sort_indices = _convert_column_names_to_indices(sort_keys); } @@ -1602,14 +1612,14 @@ std::list> unity_sframe::drop_missing_values( } else { std::vector column_indices; - - if(column_names.empty()) { - column_indices.resize(this->num_columns()); - std::iota(column_indices.begin(), column_indices.end(), 0); - } else { + + if(column_names.empty()) { + column_indices.resize(this->num_columns()); + std::iota(column_indices.begin(), column_indices.end(), 0); + } else { column_indices = _convert_column_names_to_indices(column_names); } - + // Separate out the columns that require contains_na, which is more expensive. size_t n_recursive = 0, n_simple = column_indices.size(); @@ -1755,7 +1765,7 @@ std::vector unity_sframe::_convert_column_names_to_indices( #ifndef NDEBUG { DASSERT_EQ(column_indices.size(), column_names.size()); - for(size_t i = 0; i < column_names.size(); ++i ) { + for(size_t i = 0; i < column_names.size(); ++i ) { DASSERT_EQ(column_names[i], this->column_name(column_indices[i])); } } @@ -2157,7 +2167,7 @@ void unity_sframe::explore(const std::string& path_to_client, const std::string& std::string column_name; enum class MethodType {None = 0, GetRows = 1, GetAccordion = 2}; - MethodType response = MethodType::None; + MethodType response = MethodType::None; auto sa = gl_sarray(std::vector(1, input)).astype(flex_type_enum::DICT); flex_dict dict = sa[0].get(); diff --git a/src/python/turicreate/data_structures/sframe.py b/src/python/turicreate/data_structures/sframe.py index 7039bd1309..356ba616f6 100644 --- a/src/python/turicreate/data_structures/sframe.py +++ b/src/python/turicreate/data_structures/sframe.py @@ -3781,48 +3781,8 @@ def append(self, other): if type(other) is not SFrame: raise RuntimeError("SFrame append can only work with SFrame") - left_empty = len(self.column_names()) == 0 - right_empty = len(other.column_names()) == 0 - - if (left_empty and right_empty): - return SFrame() - - if (left_empty or right_empty): - non_empty_sframe = self if right_empty else other - return non_empty_sframe.__copy__() - - my_column_names = self.column_names() - my_column_types = self.column_types() - other_column_names = other.column_names() - if (len(my_column_names) != len(other_column_names)): - raise RuntimeError("Two SFrames have to have the same number of columns") - - # check if the order of column name is the same - column_name_order_match = True - for i in range(len(my_column_names)): - if other_column_names[i] != my_column_names[i]: - column_name_order_match = False - break - - processed_other_frame = other - if not column_name_order_match: - # we allow name order of two sframes to be different, so we create a new sframe from - # "other" sframe to make it has exactly the same shape - processed_other_frame = SFrame() - for i in range(len(my_column_names)): - col_name = my_column_names[i] - if(col_name not in other_column_names): - raise RuntimeError("Column " + my_column_names[i] + " does not exist in second SFrame") - - other_column = other.select_column(col_name) - processed_other_frame.add_column(other_column, col_name, inplace=True) - - # check column type - if my_column_types[i] != other_column.dtype: - raise RuntimeError("Column " + my_column_names[i] + " type is not the same in two SFrames, one is " + str(my_column_types[i]) + ", the other is " + str(other_column.dtype)) - with cython_context(): - return SFrame(_proxy=self.__proxy__.append(processed_other_frame.__proxy__)) + return SFrame(_proxy=self.__proxy__.append(other.__proxy__)) def groupby(self, key_column_names, operations, *args): """ diff --git a/test/unity/unity_sframe.cxx b/test/unity/unity_sframe.cxx index 06fdc9d939..bced8ba349 100644 --- a/test/unity/unity_sframe.cxx +++ b/test/unity/unity_sframe.cxx @@ -178,7 +178,7 @@ struct unity_sframe_test { tmp_vec.clear(); // Filter by all 0's - res_ptr = sf->logical_filter(unity_zero); + res_ptr = sf->logical_filter(unity_zero); sa_res_ptr = sa->logical_filter(unity_zero); TS_ASSERT_EQUALS(res_ptr->size(), 0); TS_ASSERT_EQUALS(sa_res_ptr->size(), 0); @@ -335,9 +335,9 @@ struct unity_sframe_test { sf2->add_column(sa2, "b"); sf2->add_column(sa1, "a"); - TS_ASSERT_THROWS_ANYTHING(sf1->append(sf2)); + TS_ASSERT_THROWS_NOTHING(sf1->append(sf2)); } - + void test_append_type_mismatch() { std::vector test_data1; std::vector test_data2; @@ -358,7 +358,7 @@ struct unity_sframe_test { TS_ASSERT_THROWS_ANYTHING(sf1->append(sf2)); } - + void test_append() { dataframe_t testdf = _create_test_dataframe(); @@ -385,7 +385,7 @@ struct unity_sframe_test { TS_ASSERT_EQUALS(sf3_value.values["c"][i], testdf.values["c"][i - sf1->size()]); } } - + void test_append_empty() { auto sf1 = std::make_shared(); auto sf2 = std::make_shared();