-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Dataset main class renaming. * Fix bug of cloning pipeline * Added a dataset builder with three types of failure specification modes: * Added local test for dataset building * Only one file with: An increasing feature that is the cumulative time of the piece being in place A column life id A column with a life end indicator * two files: data + list of failures A list of failures Upload multiple files Separated cycles
- Loading branch information
1 parent
7862b3c
commit 700458e
Showing
47 changed files
with
2,809 additions
and
798 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,4 +9,4 @@ | |
CACHE_PATH.mkdir(parents=True, exist_ok=True) | ||
|
||
|
||
__version__ = "2.0.6" | ||
__version__ = "3.0.0" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,180 @@ | ||
import logging | ||
import os | ||
from pathlib import Path | ||
from typing import Callable, List, Optional, Tuple, Union | ||
|
||
|
||
import pandas as pd | ||
from tqdm.auto import tqdm | ||
|
||
from ceruleo.dataset.builder.cycles_splitter import ( | ||
CyclesSplitter, | ||
FailureDataCycleSplitter, | ||
) | ||
from ceruleo.dataset.builder.output import OutputMode | ||
from ceruleo.dataset.builder.rul_column import RULColumn | ||
from ceruleo.dataset.ts_dataset import PDMDataset | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
def load_dataframe(path: Union[str, Path]) -> pd.DataFrame:
    """Load a data file into a pandas DataFrame.

    The format is selected from the file extension (case-insensitive).
    Supported formats: CSV (``.csv``), Parquet (``.parquet``) and
    Excel (``.xlsx``).

    Parameters:
        path: Location of the file to load.

    Returns:
        The loaded DataFrame.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
        ValueError: If the file extension is not a supported format.
    """
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File {path} does not exist")
    # Dispatch on the (lower-cased) suffix so ".CSV" works the same as ".csv".
    loaders = {
        ".csv": pd.read_csv,
        ".parquet": pd.read_parquet,
        ".xlsx": pd.read_excel,
    }
    loader = loaders.get(path.suffix.lower())
    if loader is None:
        raise ValueError(f"Unsupported file format {path.suffix}")
    return loader(path)
|
||
|
||
class DatasetBuilder:
    """Fluent builder that assembles a :class:`PDMDataset` from raw run-to-failure data.

    Before any ``prepare_*``/``build_*`` method can be used, the builder must
    be configured with at least an output mode and a splitting method (see
    ``_validate``); the pair-based methods additionally require the splitter
    to be a :class:`FailureDataCycleSplitter`.  All ``set_*`` methods return
    ``self`` so calls can be chained.
    """

    # All strategy attributes start as None and must be configured through
    # the corresponding set_* method before building.
    splitter: Optional[CyclesSplitter]
    output_mode: Optional[OutputMode]
    rul_column: Optional[RULColumn]
    dataframe_loader: Callable[[Union[str, Path]], pd.DataFrame]
    index_column: Optional[str]

    def __init__(
        self,
        dataframe_loader: Callable[[Union[str, Path]], pd.DataFrame] = load_dataframe,
    ):
        """Initializes the builder.

        Parameters:
            dataframe_loader: Callable used to read a data file into a
                DataFrame. Defaults to ``load_dataframe``.
        """
        self.output_mode = None
        self.splitter = None
        self.dataframe_loader = dataframe_loader
        self.index_column = None
        self.rul_column = None

    @staticmethod
    def one_file_format() -> "DatasetBuilder":
        """Create a builder for the single-file input format."""
        return DatasetBuilder()

    def set_splitting_method(self, splitter: CyclesSplitter) -> "DatasetBuilder":
        """Set the strategy used to split the raw data into cycles."""
        self.splitter = splitter
        return self

    def set_index_column(self, index_column: str) -> "DatasetBuilder":
        """Set the column to be used as index of each stored cycle."""
        self.index_column = index_column
        return self

    def set_machine_id_feature(self, name: str) -> "DatasetBuilder":
        """Set the feature that identifies the machine each row belongs to."""
        # NOTE(review): stored but not read anywhere in this class —
        # presumably consumed by the splitter/output mode; confirm.
        self._machine_type_feature = name
        return self

    def set_rul_column_method(self, rul_column: RULColumn) -> "DatasetBuilder":
        """Set the strategy used to compute the RUL column of each cycle."""
        self.rul_column = rul_column
        return self

    def set_output_mode(self, output_mode: OutputMode) -> "DatasetBuilder":
        """Set where and how the resulting cycles are stored."""
        self.output_mode = output_mode
        return self

    def _validate(self):
        """Raise ValueError if the builder is missing mandatory configuration."""
        if self.output_mode is None:
            raise ValueError("Output mode not set")
        if self.splitter is None:
            raise ValueError("Splitting method not set")

    def build(self, input_path: Path):
        """Split the file at ``input_path`` into cycles using the configured splitter."""
        self._validate()
        self.splitter.split(input_path, self.output_mode)

    def _require_pair_splitter(self):
        """Raise unless the configured splitter accepts (data, faults) pairs."""
        if not isinstance(self.splitter, FailureDataCycleSplitter):
            raise ValueError(
                "This method is only available for FailureDataCycleSplitter"
            )

    @staticmethod
    def _relative_name(path: Path, base: Optional[Path]) -> str:
        """Return ``path`` relative to ``base`` when possible, else the full path."""
        if base is not None:
            try:
                return str(path.relative_to(base))
            except ValueError:
                pass
        return str(path)

    def prepare_from_data_fault_pairs_files(
        self, data_fault_pairs: Union[Tuple[str, str], List[Tuple[str, str]]]
    ):
        """Split (data file, fault file) pairs into cycles and store them.

        Each pair is loaded with ``dataframe_loader``, split with the
        configured :class:`FailureDataCycleSplitter`, and every resulting
        cycle is stored through the output mode together with metadata that
        records which files it came from.

        Parameters:
            data_fault_pairs: One ``(data_path, fault_path)`` pair or a list
                of such pairs.

        Raises:
            ValueError: If the builder is not fully configured or the
                splitter is not a ``FailureDataCycleSplitter``.
        """
        if not isinstance(data_fault_pairs, list):
            data_fault_pairs = [data_fault_pairs]
        self._require_pair_splitter()
        # Validate up front (consistent with prepare_from_df) so a missing
        # output mode fails with a clear message instead of an AttributeError.
        self._validate()

        pairs = [(Path(data), Path(fault)) for data, fault in data_fault_pairs]
        # Deepest common directory of every involved file, used to store
        # short relative filenames in the metadata.  os.path.commonpath is
        # component-aware (unlike commonprefix, which can cut mid-filename);
        # it raises ValueError on mixed absolute/relative paths, in which
        # case we fall back to full names.
        try:
            common_dir = Path(
                os.path.commonpath([str(p) for pair in pairs for p in pair])
            )
        except ValueError:
            common_dir = None

        for i, (data_path, fault_path) in enumerate(tqdm(pairs)):
            df_data = self.dataframe_loader(data_path)
            df_faults = self.dataframe_loader(fault_path)
            cycles_in_file = self.splitter.split(df_data, df_faults)
            for j, ds in enumerate(cycles_in_file):
                self._build_and_store_cycle(
                    ds,
                    f"{i+1}_{j+1}",
                    metadata={
                        "Raw Data Filename": self._relative_name(data_path, common_dir),
                        "Raw Fault Filename": self._relative_name(fault_path, common_dir),
                    },
                )
        self.output_mode.finish()

    def build_from_data_fault_pairs_files(
        self, data_fault_pairs: Union[Tuple[str, str], List[Tuple[str, str]]]
    ) -> PDMDataset:
        """Prepare the cycles from file pairs and build the resulting dataset."""
        self.prepare_from_data_fault_pairs_files(data_fault_pairs)
        return self.output_mode.build_dataset(self)

    def prepare_from_df(self, data: Union[pd.DataFrame, List[pd.DataFrame]]) -> None:
        """Split in-memory DataFrame(s) into cycles and store them.

        Parameters:
            data: A single DataFrame or a list of DataFrames to split.

        Raises:
            ValueError: If the builder is not fully configured.
        """
        # Original annotation claimed a PDMDataset return, but this method
        # only stores the cycles; build_from_df returns the dataset.
        if not isinstance(data, list):
            data = [data]
        self._validate()
        for i, data_element in enumerate(data):
            for j, ds in enumerate(self.splitter.split(data_element)):
                self._build_and_store_cycle(ds, f"{i+1}_{j+1}")
        self.output_mode.finish()

    def build_from_df(self, data: Union[pd.DataFrame, List[pd.DataFrame]]) -> PDMDataset:
        """Prepare the cycles from DataFrame(s) and build the resulting dataset."""
        self.prepare_from_df(data)
        return self.output_mode.build_dataset(self)

    def prepare_from_data_fault_pair(
        self,
        data_fault_pairs: Union[
            Tuple[pd.DataFrame, pd.DataFrame], List[Tuple[pd.DataFrame, pd.DataFrame]]
        ],
    ):
        """Split in-memory (data, faults) DataFrame pairs into cycles and store them.

        Parameters:
            data_fault_pairs: One ``(data, faults)`` DataFrame pair or a list
                of such pairs.

        Raises:
            ValueError: If the builder is not fully configured or the
                splitter is not a ``FailureDataCycleSplitter``.
        """
        if not isinstance(data_fault_pairs, list):
            data_fault_pairs = [data_fault_pairs]
        self._require_pair_splitter()
        # Validate up front, consistent with prepare_from_df.
        self._validate()
        for i, (data, fault) in enumerate(tqdm(data_fault_pairs)):
            cycles_in_file = self.splitter.split(data, fault)
            for j, ds in enumerate(cycles_in_file):
                self._build_and_store_cycle(ds, f"{i+1}_{j+1}")
        self.output_mode.finish()

    def build_from_data_fault_pair(
        self,
        data_fault_pairs: Union[
            Tuple[pd.DataFrame, pd.DataFrame], List[Tuple[pd.DataFrame, pd.DataFrame]]
        ],
    ) -> PDMDataset:
        """Prepare the cycles from DataFrame pairs and build the resulting dataset."""
        self.prepare_from_data_fault_pair(data_fault_pairs)
        return self.output_mode.build_dataset(self)

    def _build_and_store_cycle(
        self, ds: pd.DataFrame, cycle_id: str, metadata: Optional[dict] = None
    ):
        """Attach the RUL column, optionally re-index, and store one cycle.

        Parameters:
            ds: The cycle's DataFrame (mutated in place).
            cycle_id: Identifier used to name the stored cycle.
            metadata: Optional metadata dict passed to the output mode.
                A None default avoids the shared mutable-default pitfall.
        """
        ds["RUL"] = self.rul_column.get(ds)
        if self.index_column is not None:
            ds.set_index(self.index_column, inplace=True)
        self.output_mode.store(
            f"Cycle_{cycle_id}", ds, metadata if metadata is not None else {}
        )
Oops, something went wrong.