KCL-BMEIS · deng113jie · Oct 18, 2021 · Mar 11, 2021 · Mar 12, 2021 · Mar 12, 2021
diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
@@ -12,8 +12,11 @@ on:
 jobs:
   build:
 
-    runs-on: ubuntu-latest
-
+    runs-on: ${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [macos-latest, windows-latest, ubuntu-latest] 
+
     steps:
     - uses: actions/checkout@v2
     - name: Set up Python 3.7
@@ -23,8 +26,7 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install flake8 numpy numba pandas h5py
-        if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
+        pip install flake8 numpy numba pandas h5py cython
     - name: Lint with flake8
       run: |
         # stop the build if there are Python syntax errors or undefined names
@@ -33,4 +35,4 @@ jobs:
         flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
     - name: Test with unittest
       run: |
-        python -m unittest tests/*
+        python -m unittest
diff --git a/.github/workflows/python-publish.yml → .github/workflows/python-publish-linux.yml b/.github/workflows/python-publish.yml → .github/workflows/python-publish-linux.yml
@@ -6,7 +6,7 @@
 # separate terms of service, privacy policy, and support
 # documentation.
 
-name: Upload Python Package
+name: Build & upload package on Linux
 
 on:
   release:
@@ -26,9 +26,14 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install build
+        pip install build setuptools wheel cython twine
+    - name: Set up GCC
+      uses: egor-tensin/setup-gcc@v1
+      with:
+        version: latest
+        platform: x64
     - name: Build package
-      run: python -m build
+      run: python setup.py bdist_wheel
     - name: Publish package
       uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29
       with:

diff --git a/.github/workflows/python-publish-macos.yml b/.github/workflows/python-publish-macos.yml
@@ -0,0 +1,37 @@
+# Builds a wheel on macOS and uploads it with Twine whenever a release is published.
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Build & upload package on MacOS
+
+on:
+  release:
+    types:
+      - published
+
+jobs:
+  deploy:
+
+    runs-on: macos-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build setuptools wheel cython twine
+    - name: Build package
+      run: python setup.py bdist_wheel
+    - name: Publish package
+      env:
+        TWINE_USERNAME: __token__
+        TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
+      run: python3 -m twine upload dist/*
diff --git a/.github/workflows/python-publish-win.yml b/.github/workflows/python-publish-win.yml
@@ -0,0 +1,41 @@
+# Builds a wheel on Windows (MinGW toolchain) and uploads it with Twine whenever a release is published.
+# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries
+
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+name: Build & upload package on Windows
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  deploy:
+
+    runs-on: windows-latest
+
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.x'
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install build setuptools wheel cython twine
+    - name: Set up MinGW
+      uses: egor-tensin/setup-mingw@v2
+      with:
+        platform: x64
+    - name: Build package
+      run: python setup.py bdist_wheel
+    - name: Publish package
+      run: |
+        python -m twine upload dist/*
+      env:
+        TWINE_USERNAME: __token__
+        TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }}
diff --git a/exetera/_libs/ops.pyx b/exetera/_libs/ops.pyx
@@ -0,0 +1,8 @@
+def fib(n):
+    """Print every Fibonacci number smaller than n, one per line, then an empty line."""
+    previous, current = 0, 1
+    while current < n:
+        print(current)
+        previous, current = current, previous + current
+    # terminate the output with an empty line
+    print()
diff --git a/exetera/core/dataframe.py b/exetera/core/dataframe.py
@@ -565,6 +565,168 @@ def groupby(self, by: Union[str, List[str]], hint_keys_is_sorted=False):
 
         return HDF5DataFrameGroupBy(self._columns, by, sorted_index, spans)
 
+    def describe(self, include=None, exclude=None):
+        """
+        Show the basic statistics of the data in each field.
+
+        The statistics are printed to stdout five fields at a time and are also returned. Numeric statistics are
+        formatted as strings with two decimal places; statistics that do not apply to a field are the string 'NaN'.
+
+        :param include: The field name or data type or simply 'all' to indicate the fields included in the calculation.
+            A list of field names or a list of data types is also accepted. If None, only numeric and timestamp
+            fields are described.
+        :param exclude: The field name or data type (or a list of either) to exclude from the calculation.
+        :return: A dict mapping each statistic name to a list with one entry per described field.
+        :raises ValueError: If include and exclude conflict, have an unsupported type or are an empty list, or if no
+            field is left to describe.
+
+        """
+        # an empty list can not be classified as a list of names or a list of types
+        for name, value in (('include', include), ('exclude', exclude)):
+            if isinstance(value, list) and len(value) == 0:
+                raise ValueError('The {} parameter can not be an empty list.'.format(name))
+
+        # check include and exclude conflicts
+        if include is not None and exclude is not None:
+            exclude_head = exclude[0] if isinstance(exclude, list) else exclude
+            if isinstance(include, str):
+                raise ValueError('Please do not use exclude parameter when include is set as a single field.')
+            elif isinstance(include, type):
+                if isinstance(exclude_head, type):
+                    raise ValueError(
+                        'Please do not use set exclude as a type when include is set as a single data type.')
+            elif isinstance(include, list):
+                if isinstance(include[0], str) and isinstance(exclude_head, str):
+                    raise ValueError('Please do not use exclude as the same type as the include parameter.')
+                elif isinstance(include[0], type) and isinstance(exclude_head, type):
+                    raise ValueError('Please do not use exclude as the same type as the include parameter.')
+
+        fields_to_calculate = []
+        if include is not None:
+            if isinstance(include, str):  # a single str
+                if include == 'all':
+                    fields_to_calculate = list(self.columns.keys())
+                elif include in self.columns.keys():
+                    fields_to_calculate = [include]
+                else:
+                    raise ValueError('The field to include in not in the dataframe.')
+            elif isinstance(include, type):  # a single type
+                for f in self.columns:
+                    if not self[f].indexed and np.issubdtype(self[f].data.dtype, include):
+                        fields_to_calculate.append(f)
+                if len(fields_to_calculate) == 0:
+                    raise ValueError('No such type appeared in the dataframe.')
+            elif isinstance(include, list) and isinstance(include[0], str):  # a list of str
+                for f in include:
+                    if f in self.columns.keys():
+                        fields_to_calculate.append(f)
+                if len(fields_to_calculate) == 0:
+                    raise ValueError('The fields to include in not in the dataframe.')
+
+            elif isinstance(include, list) and isinstance(include[0], type):  # a list of type
+                for t in include:
+                    for f in self.columns:
+                        if f in fields_to_calculate or self[f].indexed:
+                            continue  # skip indexed fields and fields already matched by an earlier type
+                        if np.issubdtype(self[f].data.dtype, t):
+                            fields_to_calculate.append(f)
+                if len(fields_to_calculate) == 0:
+                    raise ValueError('No such type appeared in the dataframe.')
+
+            else:
+                raise ValueError('The include parameter can only be str, dtype, or list of either.')
+
+        else:  # include is None, numeric & timestamp fields only (no indexed strings) TODO confirm the type
+            for f in self.columns:
+                if isinstance(self[f], (fld.NumericField, fld.TimestampField)):
+                    fields_to_calculate.append(f)
+
+        if len(fields_to_calculate) == 0:
+            raise ValueError('No fields included to describe.')
+
+        if exclude is not None:
+            # the list is rebuilt rather than mutated while it is being iterated, which would skip entries
+            if isinstance(exclude, str):
+                if exclude in fields_to_calculate:  # exclude
+                    fields_to_calculate.remove(exclude)  # remove from list
+            elif isinstance(exclude, type):  # a type
+                fields_to_calculate = [f for f in fields_to_calculate
+                                       if not np.issubdtype(self[f].data.dtype, exclude)]
+            elif isinstance(exclude, list) and isinstance(exclude[0], str):  # a list of str
+                # names that were not selected are ignored, as for a single excluded name
+                fields_to_calculate = [f for f in fields_to_calculate if f not in exclude]
+
+            elif isinstance(exclude, list) and isinstance(exclude[0], type):  # a list of type
+                fields_to_calculate = [f for f in fields_to_calculate
+                                       if not any(np.issubdtype(self[f].data.dtype, t) for t in exclude)]
+
+            else:
+                raise ValueError('The exclude parameter can only be str, dtype, or list of either.')
+
+        if len(fields_to_calculate) == 0:
+            raise ValueError('All fields are excluded, no field left to describe.')
+
+        numeric_types = (fld.NumericField, fld.TimestampField)
+        string_types = (fld.CategoricalField, fld.FixedStringField, fld.IndexedStringField)
+        # unique / top / freq are only reported if a categorical or string field is described
+        des_idxstr = any(isinstance(self[f], string_types) for f in fields_to_calculate)
+
+        # calculation
+        result = {'fields': [], 'count': [], 'mean': [], 'std': [], 'min': [], '25%': [], '50%': [], '75%': [],
+                  'max': []}
+        numeric_stats = ['mean', 'std', 'min', '25%', '50%', '75%', 'max']
+        if des_idxstr:
+            result['unique'], result['top'], result['freq'] = [], [], []
+
+        for f in fields_to_calculate:
+            field = self[f]
+            result['fields'].append(f)
+            result['count'].append(len(field.data))
+
+            if not des_idxstr or isinstance(field, numeric_types):  # numeric, timestamp
+                if des_idxstr:
+                    result['unique'].append('NaN')
+                    result['top'].append('NaN')
+                    result['freq'].append('NaN')
+
+                values = field.data[:]  # read the data once for all the statistics
+                result['mean'].append("{:.2f}".format(np.mean(values)))
+                result['std'].append("{:.2f}".format(np.std(values)))
+                result['min'].append("{:.2f}".format(np.min(values)))
+                # np.percentile takes the percentage in the range [0, 100], not a fraction
+                result['25%'].append("{:.2f}".format(np.percentile(values, 25)))
+                result['50%'].append("{:.2f}".format(np.percentile(values, 50)))
+                result['75%'].append("{:.2f}".format(np.percentile(values, 75)))
+                result['max'].append("{:.2f}".format(np.max(values)))
+
+            elif isinstance(field, string_types):  # categorical & indexed string & fixed string
+                a, b = np.unique(field.data[:], return_counts=True)
+                most_frequent = np.argmax(b)
+                result['unique'].append(len(a))
+                result['top'].append(a[most_frequent])
+                result['freq'].append(b[most_frequent])
+                for stat in numeric_stats:
+                    result[stat].append('NaN')
+
+            else:  # any other field type: fill every statistic so that the columns stay aligned
+                for stat in ['unique', 'top', 'freq'] + numeric_stats:
+                    result[stat].append('NaN')
+
+        # display
+        columns_to_show = ['fields', 'count', 'unique', 'top', 'freq', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
+        # 5 fields each time for display
+        for col in range(0, len(result['fields']), 5):  # 5 column each time
+            for i in columns_to_show:
+                if i in result:
+                    print(i, end='\t')
+                    # slicing clamps at the end of the list, so the last group may hold fewer than 5 fields
+                    for f in result[i][col:col + 5]:
+                        print('{:>15}'.format(f), end='\t')
+                    print('')
+            print('\n')
+
+        return result
+
 
 
 class HDF5DataFrameGroupBy(DataFrameGroupBy):

diff --git a/exetera/core/field_importers.py b/exetera/core/field_importers.py
@@ -5,7 +5,8 @@
 from exetera.core import operations as ops
 from exetera.core.data_writer import DataWriter
 from exetera.core import utils
-from datetime import datetime, date
+from datetime import datetime, date, timezone
+import pytz
 
 INDEXED_STRING_FIELD_SIZE = 10 # guessing
 
@@ -307,14 +308,14 @@ def write_part(self, values):
                     # ts = datetime.strptime(value.decode(), '%Y-%m-%d %H:%M:%S.%f%z')
                     v_datetime = datetime(int(value[0:4]), int(value[5:7]), int(value[8:10]),
                                           int(value[11:13]), int(value[14:16]), int(value[17:19]),
-                                          int(value[20:26]))
+                                          int(value[20:26]), tzinfo=timezone.utc)
                 elif v_len == 25:
                     # ts = datetime.strptime(value.decode(), '%Y-%m-%d %H:%M:%S%z')
                     v_datetime = datetime(int(value[0:4]), int(value[5:7]), int(value[8:10]),
-                                          int(value[11:13]), int(value[14:16]), int(value[17:19]))
+                                          int(value[11:13]), int(value[14:16]), int(value[17:19]), tzinfo=timezone.utc)
                 elif v_len == 19:
                     v_datetime = datetime(int(value[0:4]), int(value[5:7]), int(value[8:10]),
-                                          int(value[11:13]), int(value[14:16]), int(value[17:19]))
+                                          int(value[11:13]), int(value[14:16]), int(value[17:19]), tzinfo=timezone.utc)
                 else:
                     raise ValueError(f"Date field '{self.field}' has unexpected format '{value}'")
                 datetime_ts[i] = v_datetime.timestamp()
@@ -362,6 +363,7 @@ def write_part(self, values):
                 flags[i] = False
             else:
                 ts = datetime.strptime(value.decode(), '%Y-%m-%d')
+                ts = ts.replace(tzinfo=timezone.utc)
                 date_ts[i] = ts.timestamp()
 
         self.field.data.write_part(date_ts)

diff --git a/exetera/core/fields.py b/exetera/core/fields.py
@@ -1557,8 +1557,14 @@ def nformat(self):
     @property
     def keys(self):
         self._ensure_valid()
-        kv = self._field['key_values']
-        kn = self._field['key_names']
+        if isinstance(self._field['key_values'][0], str):  # convert str entries into bytes to keep up with linux
+            kv = [bytes(i, 'utf-8') for i in self._field['key_values']]
+        else:
+            kv = self._field['key_values']
+        if isinstance(self._field['key_names'][0], str):
+            kn = [bytes(i, 'utf-8') for i in self._field['key_names']]
+        else:
+            kn = self._field['key_names']
         keys = dict(zip(kv, kn))
         return keys
 

diff --git a/exetera/core/persistence.py b/exetera/core/persistence.py
@@ -169,7 +169,7 @@ def _apply_sort_to_array(index, values):
 @njit
 def _apply_sort_to_index_values(index, indices, values):
 
-    s_indices = np.zeros_like(indices)
+    s_indices = np.zeros_like(indices, dtype=np.int64)
     s_values = np.zeros_like(values)
     accumulated = np.int64(0)
     s_indices[0] = 0
@@ -1029,7 +1029,7 @@ def apply_spans_concat(self, spans, reader, writer):
 
         src_index = reader.field['index'][:]
         src_values = reader.field['values'][:]
-        dest_index = np.zeros(reader.chunksize, src_index.dtype)
+        dest_index = np.zeros(reader.chunksize, np.int64)
         dest_values = np.zeros(reader.chunksize * 16, src_values.dtype)
 
         max_index_i = reader.chunksize

diff --git a/exetera/core/readerwriter.py b/exetera/core/readerwriter.py
@@ -60,7 +60,7 @@ def dtype(self):
         return self.field['index'].dtype, self.field['values'].dtype
 
     def sort(self, index, writer):
-        field_index = self.field['index'][:]
+        field_index = np.array(self.field['index'][:], dtype=np.int64)
         field_values = self.field['values'][:]
         r_field_index, r_field_values =\
             pers._apply_sort_to_index_values(index, field_index, field_values)