#17, #24 tests for read text and excel file - parse excel

added files for testing
InbarShirizly · Oct 5, 2020 · e161ce5 · e161ce5
1 parent 7eb85a7
commit e161ce5
Show file tree

Hide file tree

Showing 25 changed files with 341 additions and 76 deletions.
diff --git a/server/server/api/reports.py b/server/server/api/reports.py
@@ -1,6 +1,6 @@
 from server.api import api
 from flask_restful import Resource, reqparse, abort, marshal
-from server.parsing.attendance_check import Attendance
+from server.parsing.attendance import Attendance
 from werkzeug.datastructures import FileStorage
 from server import db, auth
 from datetime import datetime

diff --git a/server/server/config.py b/server/server/config.py
@@ -5,13 +5,13 @@ class FlaskConfig:
 
 class ParseConfig:
     FILE_COLS_DICT = {
-        "name": ["שם התלמיד", "תלמידים", "שמות", "שם", "סטודנט"],
-        "id_number": ["תעודת זהות", "ת.ז.", "ת.ז", "תז"],
-        "phone": ["טלפון", "מספר טלפון", "מס טלפון"],
-        "gender": ["מין"],
-        "org_class": ["כיתה"]
+        "name": ["שם התלמיד", "תלמידים", "שמות", "שם", "סטודנט", "name", "student_name", "student"],
+        "id_number": ["תעודת זהות", "ת.ז.", "ת.ז", "תז", "id", "number_id", "id_number"],
+        "phone": ["טלפון", "מספר טלפון", "מס טלפון", "phone", "phone_number"],
+        "gender": ["מין", "gender"],
+        "org_class": ["כיתה", "org_class", "class"]
     }
-    MASHOV_COLS = ["name", "org_class"]
+    MASHOV_COLS = ["name", "org_class", "id_number"]
     GENDER_DICT = {1: ["זכר", "ז", "(ז)"], 0: ["נקבה", "נ", "(נ)"]}
 
 

diff --git a/server/server/parsing/__init__.py b/server/server/parsing/__init__.py
@@ -1,5 +1,5 @@
 from server.config import ParseConfig
-from server.parsing.loading_classroom_file import ParseClassFile
+from server.parsing.parse_class_file import ParseClassFile
 from collections import namedtuple
 
 

diff --git a/server/server/parsing/attendance_check.py → server/server/parsing/attendance.py b/server/server/parsing/attendance_check.py → server/server/parsing/attendance.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 from server.parsing.session import Session
-#from server.parsing import AttendanceMetaData
+from server.parsing import AttendanceMetaData
 
 
 class Attendance:

diff --git a/server/server/parsing/loading_classroom_file.py b/server/server/parsing/loading_classroom_file.py
diff --git a/server/server/parsing/parse_class_file.py b/server/server/parsing/parse_class_file.py
@@ -0,0 +1,130 @@
+import pandas as pd
+import numpy as np
+import re
+
+DELETE_ROWS_CONTAIN = ["הופק בתאריך"]  #TODO: need to remove to config
+
+class ParseClassFile:
+    """Normalize a raw class-list table into the columns named in the configuration.
+
+    Two layouts are recognized: a "Mashov" export, detected by name cells shaped
+    like <digits><one char><Hebrew text><Hebrew text in parentheses>, and a
+    "classic" table whose column titles are listed in ``file_cols_dict``.
+    """
+
+    def __init__(self, file_cols_dict, mashov_cols, gender_dict):
+        """Store the parsing configuration.
+
+        :param file_cols_dict: output column name -> list of accepted file titles
+        :param mashov_cols: columns a Mashov file must contain after renaming
+        :param gender_dict: gender code -> list of markers mapped to that code
+        """
+        self._file_cols_dict = file_cols_dict
+        self._mashov_cols = mashov_cols
+        self._gender_dict = gender_dict
+
+    @classmethod
+    def from_object(cls, config):
+        """Build a parser from an object exposing FILE_COLS_DICT, MASHOV_COLS and GENDER_DICT."""
+        return cls(
+            config.FILE_COLS_DICT,
+            config.MASHOV_COLS,
+            config.GENDER_DICT
+        )
+
+    def parse_df(self, df_students):
+        """Return a frame whose columns are exactly the keys of ``file_cols_dict``.
+
+        :param df_students: raw table; check_if_mashov_file cleans it in place
+        :return: DataFrame with a fresh 0..n-1 index; columns the file lacks are NaN
+        :raises ValueError: when a Mashov file misses one of the ``mashov_cols``
+        """
+        if self.check_if_mashov_file(df_students):
+            df_students = self.mashov_file(df_students)
+        else:
+            df_students = self.classic_file(df_students)
+
+        # Add every configured column the file did not provide, filled with NaN
+        for col in self._file_cols_dict:
+            if col not in df_students.columns:
+                df_students[col] = np.nan
+
+        final_df = df_students[list(self._file_cols_dict)]
+        # drop=True discards the old index whatever it is named
+        return final_df.reset_index(drop=True)
+
+    @staticmethod
+    def check_if_mashov_file(df_students):
+        """Tell whether one of the columns holds Mashov-style name cells.
+
+        Mutates ``df_students``: fully empty rows and columns are dropped, and the
+        first matching column is renamed to "name" (mashov_file relies on that).
+
+        :return: True when a matching column was found, otherwise False
+        """
+        df_students.dropna(axis=0, how="all", inplace=True)
+        df_students.dropna(axis=1, how="all", inplace=True)
+
+        # NOTE(review): the "." after \d+ is unescaped and matches any character -
+        # confirm whether a literal dot was intended
+        for col in df_students.columns:
+            if df_students[col].astype(str).str.match(r"(\d+.)([\u0590-\u05fe ]+)([(\u0590-\u05fe)]+)").any():
+                df_students.rename(columns={col: "name"}, inplace=True)
+                return True
+        return False
+
+    def mashov_file(self, df_students):
+        """Extract the configured Mashov columns plus a gender column.
+
+        :param df_students: table already processed by check_if_mashov_file
+        :return: DataFrame with ``mashov_cols`` and "gender"; "name" holds the
+            first run of Hebrew letters and spaces of the cell, stripped
+        :raises ValueError: when one of the ``mashov_cols`` is missing
+        """
+        # Drop rows in which any cell contains one of the DELETE_ROWS_CONTAIN texts
+        delete_pattern = '|'.join(re.escape(text) for text in DELETE_ROWS_CONTAIN)
+        rows_to_drop = df_students.apply(
+            lambda row: row.astype(str).str.contains(delete_pattern).any(), axis=1
+        )
+        df_students = df_students.loc[~rows_to_drop]
+
+        df_students = df_students.rename(columns={"ת.ז.": 'id_number', "כיתה": "org_class"})
+        try:
+            # copy() so the column assignments below never write into a view
+            df_students = df_students.loc[:, self._mashov_cols].copy()
+        except KeyError as err:
+            raise ValueError("File content is invalid to the program configurations") from err
+
+        mashov_name_pattern = re.compile(r"([\u0590-\u05fe ]+)([(\u0590-\u05fe)]+)")
+        df_name_gender = df_students['name'].str.extract(mashov_name_pattern, expand=False)
+        # Raw string: "\(" is not a valid escape in a plain string literal
+        gender_marker = df_name_gender[1].str.extract(r"\(([\u0590-\u05fe ])\)", expand=False)
+        df_students['gender'] = gender_marker.apply(self.gender_assign, gender_dict=self._gender_dict)
+        # The first group also captures the space before "(", so strip it
+        df_students['name'] = df_name_gender[0].str.strip()
+        return df_students
+
+    def classic_file(self, df_students):
+        """Pick the columns whose titles are listed in ``file_cols_dict``.
+
+        :return: new DataFrame keyed by the configured names; titles starting with
+            "Unnamed" and titles that match no option are left out
+        """
+        current_excel_dict = {}
+        for col in df_students.columns:
+            # Titles may be numbers or NaN when they were taken from a data row
+            title = str(col).strip()
+            if title.startswith("Unnamed"):
+                continue
+            for key, col_options in self._file_cols_dict.items():
+                if title in col_options:
+                    current_excel_dict[key] = df_students[col]
+        return pd.DataFrame(current_excel_dict)
+
+    @staticmethod
+    def gender_assign(string, gender_dict):
+        """Return the key of ``gender_dict`` whose list contains ``string``, or "" if none does."""
+        for key, vals in gender_dict.items():
+            if string in vals:
+                return key
+        return ""
diff --git a/server/server/parsing/utils.py b/server/server/parsing/utils.py
@@ -16,9 +16,59 @@ def create_chat_df(chat_file):
 
 def create_students_df(file_name, file_data):
+    """Load an uploaded students file and return it cleaned by clean_student_df.
+
+    ".csv" is read with read_csv and ".xlsx" with read_excel, both without a
+    header row; any other name is tried as an HTML table and then as an Excel
+    workbook.
+    """
     if file_name.endswith(".csv"):
-        df_students = pd.read_csv(file_data)
+        df_students = pd.read_csv(file_data, header=None)
     elif file_name.endswith(".xlsx"):
-        df_students = pd.read_excel(file_data)
+        df_students = pd.read_excel(file_data, header=None)
     else:
-        df_students = pd.read_html(file_data, header=1)[0]
-    return df_students
+        # NOTE(review): both readers below consume a header row, while
+        # clean_student_df promotes the first remaining row to the header again -
+        # confirm this is intended for these files
+        try:
+            df_students = pd.read_html(file_data, header=1)[0]
+        except ValueError:
+            # read_html may have consumed a file-like object; rewind before retrying
+            if hasattr(file_data, "seek"):
+                file_data.seek(0)
+            df_students = pd.ExcelFile(file_data).parse()
+
+    return clean_student_df(df_students)
+
+
+def clean_student_df(df_students):
+    """Trim a raw table to its informative columns and promote its first row to the header.
+
+    :param df_students: table read without a header; it is not modified
+    :return: new DataFrame; an empty one when the input has no non-missing cell
+    """
+    # first drop all rows and columns that are totally missing (for extreme cases)
+    df_students = df_students.dropna(axis=0, how="all").dropna(axis=1, how="all")
+    if df_students.empty:
+        # no row is left to become the header; validate_file_content reports empty input
+        return pd.DataFrame()
+
+    # keep a column only if it has at least max(median over columns, 3) distinct
+    # values - 3 being a title plus 2 students
+    unique_per_col = df_students.nunique()
+    min_nunique_in_cols = max(unique_per_col.median(), 3)
+    df_students = df_students.loc[:, unique_per_col >= min_nunique_in_cols]
+    return pd.DataFrame(df_students.values[1:], columns=df_students.iloc[0])
+
+
+def validate_file_content(df_student, max_records=200): # TODO: needs to be part of the full flow as well
+    """Check that the table is neither empty nor longer than ``max_records`` rows.
+
+    :param max_records: largest accepted number of rows (default 200)
+    :return: True when both checks pass
+    :raises ValueError: when the table is empty or has more than ``max_records`` rows
+    """
+    if df_student.shape[0] > max_records:
+        raise ValueError("Input file have to many records")  #TODO: pass max_records from config
+    if df_student.empty:
+        raise ValueError("Entered file is empty")
+    return True
diff --git a/server/test/files_to_test/chat_files/chat_file_empty.txt b/server/test/files_to_test/chat_files/chat_file_empty.txt
diff --git a/server/test/files_to_test/chat_files/chat_file_not_structured.txt b/server/test/files_to_test/chat_files/chat_file_not_structured.txt
@@ -0,0 +1 @@
+I love to go to school
diff --git a/server/test/files_to_test/chat_files/chat_file_not_structured_partially.txt b/server/test/files_to_test/chat_files/chat_file_not_structured_partially.txt
@@ -0,0 +1,4 @@
+10:56:16	 From  
+10:56:18	 From  
+10:56:19	 From  
+10:56:20	 From 
diff --git a/server/test/files_to_test/chat_files/chat_file_valid.txt b/server/test/files_to_test/chat_files/chat_file_valid.txt
@@ -0,0 +1,22 @@
+10:56:16	 From  Tech Challenge : Attendance check:
+10:56:18	 From  Rozanna : here
+10:56:19	 From  Rozanna : מיכל קליימן
+10:56:20	 From  May Steinfeld : ענבר עדי
+10:56:22	 From  inbar shirizly : 305696031
+10:56:22	 From  Tech Challenge : Attendance check:
+10:56:22	 From  inbar shirizly : אביתר כהן
+10:56:23	 From  May Steinfeld : אלעד כהן
+10:56:23	 From  May Steinfeld : Attendance check
+10:56:24	 From  Tammuz Dubnov : עידן אביב
+10:56:24	 From  Daniel Kagan : here
+10:56:25	 From  Dana Makov : here
+10:58:26	 From  Ron Zehavi : 534234210
+
+11:56:16	 From  Tech Challenge : Attendance check:
+
+11:56:20	 From  May Steinfeld : אבי כהן
+11:56:22	 From  inbar shirizly : 305696031
+11:56:22	 From  inbar shirizly : אביתר כהן
+11:56:23	 From  May Steinfeld : 530342413
+11:56:24	 From  Tammuz Dubnov : עידן אביב
+11:59:24	 From  Tammuz Dubnov : מיתר כהן
diff --git a/server/test/files_to_test/students_list_excel/example_csv.csv b/server/test/files_to_test/students_list_excel/example_csv.csv
@@ -0,0 +1,10 @@
+id,phone,id_number,name,org_class,gender,class_id
+11,528702484,305049421,???? ????,,,
+12,524291930,123424343,????? ???,,,
+13,526148959,432424455,???? ??????,,,
+14,523454564,423423649,???? ???,,,
+15,530342423,305696031,??? ????,,,
+16,530342413,305696041,???? ??????,,,
+17,537642324,534234210,???? ???,,,
+,,,,,,
+18,537642324,534234453,???? ???,,,
diff --git a/server/test/files_to_test/students_list_excel/example_csv_2.csv b/server/test/files_to_test/students_list_excel/example_csv_2.csv
@@ -0,0 +1,8 @@
+id,phone,id_number,name,org_class,gender,class_id,
+11,528702484,305049421,???? ????,,,,
+12,524291930,123424343,????? ???,,,,
+13,526148959,432424455,???? ??????,,,,
+14,523454564,423423649,???? ???,,,,
+15,530342423,305696031,??? ????,,,,
+16,530342413,305696041,???? ??????,,,,
+17,537642324,534234210,???? ???,,,,
diff --git a/server/test/files_to_test/students_list_excel/example_csv_3.csv b/server/test/files_to_test/students_list_excel/example_csv_3.csv
@@ -0,0 +1 @@
+id,phone,id_number,name,org_class,gender,class_id,11,528702484,305049421,???? ????,,,,12,524291930,123424343,????? ???,,,,13,526148959,432424455,???? ??????,,,,14,523454564,423423649,???? ???,,,,15,530342423,305696031,??? ????,,,,16,530342413,305696041,???? ??????,,,,17,537642324,534234210,???? ???,,,,

diff --git a/server/test/files_to_test/students_list_excel/example_csv_4.csv b/server/test/files_to_test/students_list_excel/example_csv_4.csv
@@ -0,0 +1,8 @@
+id,phone,id_number,name,org_class,gender,class_id,
+11,528702484,305049421,???? ????,,,,
+12,524291930,123424343,????? ???,,,,
+13,526148959,432424455,???? ??????,,,,
+14,523454564,423423649,???? ???,,,,
+15,530342423,305696031,??? ????,,,,
+16,530342413,305696041,???? ??????,,,,
+17,537642324,534234210,???? ???,,,,
diff --git a/server/test/files_to_test/students_list_excel/example_excel.xlsx b/server/test/files_to_test/students_list_excel/example_excel.xlsx
diff --git a/server/test/files_to_test/students_list_excel/example_excel_start_in_random_row.xlsx b/server/test/files_to_test/students_list_excel/example_excel_start_in_random_row.xlsx
diff --git a/server/test/files_to_test/students_list_excel/example_excel_too_much_records.xlsx b/server/test/files_to_test/students_list_excel/example_excel_too_much_records.xlsx
diff --git a/server/test/files_to_test/students_list_excel/example_mashov_file_edited_and_saved_97.xls b/server/test/files_to_test/students_list_excel/example_mashov_file_edited_and_saved_97.xls
diff --git a/..._to_test/students_list_excel/example_mashov_file_edited_and_saved_97_with_filled_data.xls b/..._to_test/students_list_excel/example_mashov_file_edited_and_saved_97_with_filled_data.xls
diff --git a/server/test/files_to_test/students_list_excel/example_mashov_file_empty.xls b/server/test/files_to_test/students_list_excel/example_mashov_file_empty.xls
diff --git a/server/test/files_to_test/students_list_excel/דוגמה לרשימת תלמידים.xlsx b/server/test/files_to_test/students_list_excel/דוגמה לרשימת תלמידים.xlsx
diff --git a/server/test/test_attendance.py b/server/test/test_attendance.py
@@ -0,0 +1,31 @@
+import pytest
+import sys
+from pathlib import Path
+sys.path.append('../')
+from server.parsing.attendance import Attendance
+from server.parsing.utils import create_chat_df, create_students_df
+
+
+# Sample inputs stored next to this module, so the test does not depend on one machine's paths
+FILES_DIR = Path(__file__).resolve().parent / "files_to_test"
+chat_file_path = FILES_DIR / "chat_files" / "chat_file_valid.txt"
+students_file_path = FILES_DIR / "students_list_excel" / "example_csv.csv"
+
+
+@pytest.fixture()
+def chat_df():
+    """Chat DataFrame built by create_chat_df from the lines of the sample chat file."""
+    with open(chat_file_path, "r", encoding="utf-8") as f:
+        return create_chat_df(f.readlines())
+
+
+@pytest.fixture()
+def df_students():
+    """Students DataFrame loaded by create_students_df from the sample CSV file."""
+    return create_students_df(file_name=students_file_path.name, file_data=str(students_file_path))
+
+
+def test_first_message_time(chat_df, df_students):
+    """The sample chat starts at 10:56:16, so the reported hour must be 10."""
+    report = Attendance(chat_df, df_students, ['name', "id_number", "phone"], 1, "Attendance check", ["ITC", "Tech", "Challenge"])
+    assert report.first_message_time.hour == 10
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		id,phone,id_number,name,org_class,gender,class_id,11,528702484,305049421,???? ????,,,,12,524291930,123424343,????? ???,,,,13,526148959,432424455,???? ??????,,,,14,523454564,423423649,???? ???,,,,15,530342423,305696031,??? ????,,,,16,530342413,305696041,???? ??????,,,,17,537642324,534234210,???? ???,,,,
Expand Down