Skip to content
This repository has been archived by the owner on Dec 21, 2023. It is now read-only.

Commit

Permalink
unity_sframe::append reorders sframe if needed (#2227)
Browse files Browse the repository at this point in the history
* 2222: move logic to unity_sframe

* use select_columns instead; with op_project operator

* ok, unity_sframe test should be in unity_sframe instead of sframe/
  • Loading branch information
Guihao Liang authored Aug 13, 2019
1 parent 60ed0b9 commit 4283945
Show file tree
Hide file tree
Showing 3 changed files with 37 additions and 67 deletions.
52 changes: 31 additions & 21 deletions src/core/storage/sframe_interface/unity_sframe.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -426,9 +426,9 @@ std::shared_ptr<unity_sarray_base> unity_sframe::select_column(const std::string
size_t column_index = _column_index_iter - _column_names.begin();
auto ret = select_column(column_index);

DASSERT_EQ(std::static_pointer_cast<unity_sarray>(ret)->dtype(), dtype(name));
DASSERT_EQ(std::static_pointer_cast<unity_sarray>(ret)->dtype(), dtype(name));

return ret;
return ret;
}

std::shared_ptr<unity_sarray_base> unity_sframe::select_column(size_t column_index) {
Expand All @@ -451,24 +451,24 @@ std::shared_ptr<unity_sframe_base> unity_sframe::select_columns(
#ifndef NDEBUG
{
std::shared_ptr<unity_sframe> X = std::static_pointer_cast<unity_sframe>(ret);

DASSERT_EQ(X->num_columns(), names.size());

for(size_t i = 0; i < names.size(); ++i) {
for(size_t i = 0; i < names.size(); ++i) {
DASSERT_EQ(names[i], X->column_name(i));
DASSERT_EQ(this->dtype(names[i]), X->dtype(i));
}
}
#endif

return ret;
return ret;
}

std::shared_ptr<unity_sframe_base> unity_sframe::select_columns(
const std::vector<size_t>& indices) {
Dlog_func_entry();

if(indices.empty()) {
if(indices.empty()) {
return std::make_shared<unity_sframe>();
}

Expand All @@ -479,7 +479,7 @@ std::shared_ptr<unity_sframe_base> unity_sframe::select_columns(
}

for(size_t i = 0; i < indices.size(); ++i) {
size_t col_idx = indices[i];
size_t col_idx = indices[i];
if(col_idx >= m_column_names.size()) {
std_log_and_throw(std::range_error, "Column index out of bounds.");
}
Expand Down Expand Up @@ -923,8 +923,10 @@ std::shared_ptr<unity_sframe_base> unity_sframe::append(
return ret;
}

// Error checking
// Error checking and reorder other sframe if necessary
{
bool needs_reorder = false;

if (this->num_columns() != other_sframe->num_columns()) {
log_and_throw("Two SFrames have different number of columns");
}
Expand All @@ -934,6 +936,8 @@ std::shared_ptr<unity_sframe_base> unity_sframe::append(
size_t num_columns = column_names.size();

if(column_names != other_column_names) {
needs_reorder = true;

std::sort(column_names.begin(), column_names.end());
std::sort(other_column_names.begin(), other_column_names.end());

Expand All @@ -951,8 +955,14 @@ std::shared_ptr<unity_sframe_base> unity_sframe::append(

log_and_throw(ss.str().c_str());
}

if (needs_reorder) {
other_sframe = std::static_pointer_cast<unity_sframe>(
other_sframe->select_columns(this->column_names()));
}
}


auto column_types = this->dtype();
auto other_column_types = other_sframe->dtype();

Expand Down Expand Up @@ -1278,11 +1288,11 @@ unity_sframe::sort(const std::vector<std::string>& sort_keys,
}

std::vector<size_t> sort_indices;
if(sort_keys.empty()) {
sort_indices.resize(this->num_columns());
std::iota(sort_indices.begin(), sort_indices.end(), 0);
} else {

if(sort_keys.empty()) {
sort_indices.resize(this->num_columns());
std::iota(sort_indices.begin(), sort_indices.end(), 0);
} else {
sort_indices = _convert_column_names_to_indices(sort_keys);
}

Expand Down Expand Up @@ -1602,14 +1612,14 @@ std::list<std::shared_ptr<unity_sframe_base>> unity_sframe::drop_missing_values(
} else {

std::vector<size_t> column_indices;
if(column_names.empty()) {
column_indices.resize(this->num_columns());
std::iota(column_indices.begin(), column_indices.end(), 0);
} else {

if(column_names.empty()) {
column_indices.resize(this->num_columns());
std::iota(column_indices.begin(), column_indices.end(), 0);
} else {
column_indices = _convert_column_names_to_indices(column_names);
}

// Separate out the columns that require contains_na, which is more expensive.
size_t n_recursive = 0, n_simple = column_indices.size();

Expand Down Expand Up @@ -1755,7 +1765,7 @@ std::vector<size_t> unity_sframe::_convert_column_names_to_indices(
#ifndef NDEBUG
{
DASSERT_EQ(column_indices.size(), column_names.size());
for(size_t i = 0; i < column_names.size(); ++i ) {
for(size_t i = 0; i < column_names.size(); ++i ) {
DASSERT_EQ(column_names[i], this->column_name(column_indices[i]));
}
}
Expand Down Expand Up @@ -2157,7 +2167,7 @@ void unity_sframe::explore(const std::string& path_to_client, const std::string&
std::string column_name;

enum class MethodType {None = 0, GetRows = 1, GetAccordion = 2};
MethodType response = MethodType::None;
MethodType response = MethodType::None;

auto sa = gl_sarray(std::vector<flexible_type>(1, input)).astype(flex_type_enum::DICT);
flex_dict dict = sa[0].get<flex_dict>();
Expand Down
42 changes: 1 addition & 41 deletions src/python/turicreate/data_structures/sframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3781,48 +3781,8 @@ def append(self, other):
if type(other) is not SFrame:
raise RuntimeError("SFrame append can only work with SFrame")

left_empty = len(self.column_names()) == 0
right_empty = len(other.column_names()) == 0

if (left_empty and right_empty):
return SFrame()

if (left_empty or right_empty):
non_empty_sframe = self if right_empty else other
return non_empty_sframe.__copy__()

my_column_names = self.column_names()
my_column_types = self.column_types()
other_column_names = other.column_names()
if (len(my_column_names) != len(other_column_names)):
raise RuntimeError("Two SFrames have to have the same number of columns")

# check if the order of column name is the same
column_name_order_match = True
for i in range(len(my_column_names)):
if other_column_names[i] != my_column_names[i]:
column_name_order_match = False
break

processed_other_frame = other
if not column_name_order_match:
# we allow name order of two sframes to be different, so we create a new sframe from
# "other" sframe to make it has exactly the same shape
processed_other_frame = SFrame()
for i in range(len(my_column_names)):
col_name = my_column_names[i]
if(col_name not in other_column_names):
raise RuntimeError("Column " + my_column_names[i] + " does not exist in second SFrame")

other_column = other.select_column(col_name)
processed_other_frame.add_column(other_column, col_name, inplace=True)

# check column type
if my_column_types[i] != other_column.dtype:
raise RuntimeError("Column " + my_column_names[i] + " type is not the same in two SFrames, one is " + str(my_column_types[i]) + ", the other is " + str(other_column.dtype))

with cython_context():
return SFrame(_proxy=self.__proxy__.append(processed_other_frame.__proxy__))
return SFrame(_proxy=self.__proxy__.append(other.__proxy__))

def groupby(self, key_column_names, operations, *args):
"""
Expand Down
10 changes: 5 additions & 5 deletions test/unity/unity_sframe.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ struct unity_sframe_test {
tmp_vec.clear();

// Filter by all 0's
res_ptr = sf->logical_filter(unity_zero);
res_ptr = sf->logical_filter(unity_zero);
sa_res_ptr = sa->logical_filter(unity_zero);
TS_ASSERT_EQUALS(res_ptr->size(), 0);
TS_ASSERT_EQUALS(sa_res_ptr->size(), 0);
Expand Down Expand Up @@ -335,9 +335,9 @@ struct unity_sframe_test {
sf2->add_column(sa2, "b");
sf2->add_column(sa1, "a");

TS_ASSERT_THROWS_ANYTHING(sf1->append(sf2));
TS_ASSERT_THROWS_NOTHING(sf1->append(sf2));
}

void test_append_type_mismatch() {
std::vector<flexible_type> test_data1;
std::vector<flexible_type> test_data2;
Expand All @@ -358,7 +358,7 @@ struct unity_sframe_test {

TS_ASSERT_THROWS_ANYTHING(sf1->append(sf2));
}

void test_append() {
dataframe_t testdf = _create_test_dataframe();

Expand All @@ -385,7 +385,7 @@ struct unity_sframe_test {
TS_ASSERT_EQUALS(sf3_value.values["c"][i], testdf.values["c"][i - sf1->size()]);
}
}

void test_append_empty() {
auto sf1 = std::make_shared<unity_sframe>();
auto sf2 = std::make_shared<unity_sframe>();
Expand Down

0 comments on commit 4283945

Please sign in to comment.