Skip to content

Commit

Permalink
#17, #24 tests for read text and excel file - parse excel
Browse files Browse the repository at this point in the history
added files for testing
  • Loading branch information
InbarShirizly committed Oct 5, 2020
1 parent 7eb85a7 commit e161ce5
Show file tree
Hide file tree
Showing 25 changed files with 341 additions and 76 deletions.
2 changes: 1 addition & 1 deletion server/server/api/reports.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from server.api import api
from flask_restful import Resource, reqparse, abort, marshal
from server.parsing.attendance_check import Attendance
from server.parsing.attendance import Attendance
from werkzeug.datastructures import FileStorage
from server import db, auth
from datetime import datetime
Expand Down
12 changes: 6 additions & 6 deletions server/server/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,13 @@ class FlaskConfig:

class ParseConfig:
FILE_COLS_DICT = {
"name": ["שם התלמיד", "תלמידים", "שמות", "שם", "סטודנט"],
"id_number": ["תעודת זהות", "ת.ז.", "ת.ז", "תז"],
"phone": ["טלפון", "מספר טלפון", "מס טלפון"],
"gender": ["מין"],
"org_class": ["כיתה"]
"name": ["שם התלמיד", "תלמידים", "שמות", "שם", "סטודנט", "name", "student_name", "student"],
"id_number": ["תעודת זהות", "ת.ז.", "ת.ז", "תז", "id", "number_id", "id_number"],
"phone": ["טלפון", "מספר טלפון", "מס טלפון", "phone", "phone_number"],
"gender": ["מין", "gender"],
"org_class": ["כיתה", "org_class", "class"]
}
MASHOV_COLS = ["name", "org_class"]
MASHOV_COLS = ["name", "org_class", "id_number"]
GENDER_DICT = {1: ["זכר", "ז", "(ז)"], 0: ["נקבה", "נ", "(נ)"]}


Expand Down
2 changes: 1 addition & 1 deletion server/server/parsing/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from server.config import ParseConfig
from server.parsing.loading_classroom_file import ParseClassFile
from server.parsing.parse_class_file import ParseClassFile
from collections import namedtuple


Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pandas as pd
from server.parsing.session import Session
#from server.parsing import AttendanceMetaData
from server.parsing import AttendanceMetaData


class Attendance:
Expand Down
63 changes: 0 additions & 63 deletions server/server/parsing/loading_classroom_file.py

This file was deleted.

88 changes: 88 additions & 0 deletions server/server/parsing/parse_class_file.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
import pandas as pd
import numpy as np
import re

# Text fragments marking rows to discard from a Mashov export: mashov_file()
# joins these with "|" into one regular expression and drops every row in
# which any cell contains a match.
DELETE_ROWS_CONTAIN = ["הופק בתאריך"]  # TODO: move this list into the config

class ParseClassFile:
    """Convert a raw students table into a frame with the configured columns.

    Two input layouts are handled:

    * "Mashov" files, detected by a column whose cells look like
      ``<number>. <hebrew name> (<hebrew gender letter>)``.
    * "Classic" files, whose column headers are looked up in
      ``file_cols_dict``.

    Args:
        file_cols_dict: Maps each output column name to the list of header
            spellings accepted for it in classic files.
        mashov_cols: Columns that must be present in a Mashov file once its
            headers have been renamed.
        gender_dict: Maps a gender code to the list of strings that stand
            for it.
    """

    def __init__(self, file_cols_dict, mashov_cols, gender_dict):
        self._file_cols_dict = file_cols_dict
        self._mashov_cols = mashov_cols
        self._gender_dict = gender_dict

    @classmethod
    def from_object(cls, config):
        """Build a parser from an object exposing FILE_COLS_DICT, MASHOV_COLS and GENDER_DICT."""
        return cls(
            config.FILE_COLS_DICT,
            config.MASHOV_COLS,
            config.GENDER_DICT,
        )

    def parse_df(self, df_students):
        """Return the students table restricted to the configured columns.

        Configured columns that the input does not provide are added and
        filled with NaN. The result has a fresh 0..n-1 index.

        Note:
            ``df_students`` is modified in place by ``check_if_mashov_file``.

        Raises:
            ValueError: If a Mashov file lacks one of the required columns.
        """
        if ParseClassFile.check_if_mashov_file(df_students):
            df_students = self.mashov_file(df_students)
        else:
            df_students = self.classic_file(df_students)

        output_cols = list(self._file_cols_dict)
        for col in output_cols:
            if col not in df_students.columns:
                df_students[col] = np.nan

        # drop=True also works when the index carries a name; the previous
        # reset_index().drop(columns="index") raised KeyError in that case.
        return df_students[output_cols].reset_index(drop=True)

    @staticmethod
    def check_if_mashov_file(df_students):
        """Tell whether the table is a Mashov export.

        Side effects on ``df_students`` (in place): rows and columns that are
        entirely NaN are removed, and the first column containing a
        Mashov-style name cell is renamed to ``"name"``.

        Returns:
            True if such a column was found, otherwise False.
        """
        df_students.dropna(axis=0, how="all", inplace=True)
        df_students.dropna(axis=1, how="all", inplace=True)

        for col in df_students.columns:
            if df_students[col].astype(str).str.match(r"(\d+.)([\u0590-\u05fe ]+)([(\u0590-\u05fe)]+)").any():
                df_students.rename(columns={col: "name"}, inplace=True)
                return True
        return False

    def mashov_file(self, df_students):
        """Parse a Mashov export whose name column was already renamed to ``"name"``.

        Rows containing any DELETE_ROWS_CONTAIN text are removed, the Hebrew
        id/class headers are renamed, and the combined name cell is split
        into ``name`` and ``gender``.

        Raises:
            ValueError: If any of the required Mashov columns is missing.
        """
        unwanted = "|".join(DELETE_ROWS_CONTAIN)
        by_row = df_students.T
        # An empty pattern would match every cell and wipe the whole table,
        # so nothing is dropped when no fragments are configured.
        # na=False: non-string cells never count as a match.
        rows_to_drop = [
            row for row in by_row.columns
            if unwanted and by_row[row].str.contains(unwanted, na=False).any()
        ]
        df_students = by_row.drop(columns=rows_to_drop).T

        df_students = df_students.rename(columns={"ת.ז.": 'id_number', "כיתה": "org_class"})
        try:
            # Copy so that the column assignments below act on an independent frame.
            df_students = df_students.loc[:, self._mashov_cols].copy()
        except KeyError as err:
            raise ValueError("File content is invalid to the program configurations") from err

        mashov_name_pattern = re.compile(r"([\u0590-\u05fe ]+)([(\u0590-\u05fe)]+)")
        df_name_gender = df_students['name'].str.extract(mashov_name_pattern, expand=False)
        # expand=False yields a Series (one capture group) rather than a
        # single-column DataFrame.
        gender_letter = df_name_gender[1].str.extract(r"\(([\u0590-\u05fe ])\)", expand=False)
        df_students['gender'] = gender_letter.apply(self.gender_assign, gender_dict=self._gender_dict)
        df_students['name'] = df_name_gender[0]
        return df_students

    def classic_file(self, df_students):
        """Pick the columns whose header is a known spelling of a configured column.

        Headers that are not strings, or that start with ``"Unnamed"``, are
        ignored. Returns a new frame keyed by the configured column names.
        """
        current_excel_dict = {}
        for col in df_students.columns:
            # Headers may be numbers or NaN (e.g. when promoted from a data
            # row); those cannot match any configured spelling.
            if not isinstance(col, str) or col.startswith("Unnamed"):
                continue
            for key, col_options in self._file_cols_dict.items():
                if col in col_options:
                    current_excel_dict[key] = df_students[col]
        return pd.DataFrame(current_excel_dict)

    @staticmethod
    def gender_assign(string, gender_dict):
        """Return the key of ``gender_dict`` whose list contains ``string``, or ``""`` if none does."""
        for key, vals in gender_dict.items():
            if string in vals:
                return key
        return ""
34 changes: 30 additions & 4 deletions server/server/parsing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,35 @@ def create_chat_df(chat_file):

def create_students_df(file_name, file_data):
    """Load a students file and return it as a cleaned DataFrame.

    The reader is chosen from the extension of ``file_name`` (compared
    case-insensitively): ``.csv`` and ``.xlsx`` are read without a header
    row; anything else is first tried as an HTML table and, if pandas finds
    no table, as an Excel workbook. The loaded frame is then passed through
    ``clean_student_df``.

    Args:
        file_name: Name of the uploaded file; only its extension is used.
        file_data: Path or file-like object holding the file content.

    Returns:
        The DataFrame produced by ``clean_student_df``.
    """
    lowered_name = file_name.lower()
    if lowered_name.endswith(".csv"):
        df_students = pd.read_csv(file_data, header=None)
    elif lowered_name.endswith(".xlsx"):
        df_students = pd.read_excel(file_data, header=None)
    else:
        try:
            df_students = pd.read_html(file_data, header=1)[0]
        except ValueError:
            # The failed HTML attempt may have consumed a file-like input;
            # rewind it before the Excel reader takes over.
            if hasattr(file_data, "seek"):
                file_data.seek(0)
            df_students = pd.ExcelFile(file_data).parse()

    return clean_student_df(df_students)


def clean_student_df(df_students):
    """Strip empty and low-information parts of a raw table and promote its header row.

    Steps:
      1. Drop rows and columns that are entirely missing.
      2. Keep only columns whose number of distinct values is at least
         ``max(median distinct count over all columns, 3)`` - three being
         the minimum for a title plus two students.
      3. Use the first remaining row as the column names and the rest as data.

    The input frame is not modified. An input with no data left after step 1
    yields an empty DataFrame.

    Args:
        df_students: Raw table read without a header row.

    Returns:
        A new DataFrame whose columns are named after the first remaining row.
    """
    # First drop all rows/columns that are totally missing (for extreme cases).
    trimmed = df_students.dropna(axis=0, how="all").dropna(axis=1, how="all")
    if trimmed.empty:
        # Nothing to promote to a header; iloc[0] below would raise IndexError.
        return pd.DataFrame()

    # Columns must have at least 3 unique values (a title and 2 students).
    unique_counts = trimmed.nunique()
    min_nunique_in_cols = max(unique_counts.median(), 3)
    trimmed = trimmed.loc[:, unique_counts >= min_nunique_in_cols]
    return pd.DataFrame(trimmed.values[1:], columns=trimmed.iloc[0])


def validate_file_content(df_student, max_records=200):  # TODO: needs to be part of the full flow as well
    """Check that a students table is non-empty and not too long.

    Args:
        df_student: The students DataFrame to check.
        max_records: Largest number of rows accepted (defaults to 200).

    Returns:
        True when the table passes both checks.

    Raises:
        ValueError: If the table has more than ``max_records`` rows or is empty.
    """
    if df_student.shape[0] > max_records:
        raise ValueError("Input file have to many records")
    if df_student.empty:
        raise ValueError("Entered file is empty")
    return True
Empty file.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
I love to go to school
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
10:56:16 From
10:56:18 From
10:56:19 From
10:56:20 From
22 changes: 22 additions & 0 deletions server/test/files_to_test/chat_files/chat_file_valid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
10:56:16 From Tech Challenge : Attendance check:
10:56:18 From Rozanna : here
10:56:19 From Rozanna : מיכל קליימן
10:56:20 From May Steinfeld : ענבר עדי
10:56:22 From inbar shirizly : 305696031
10:56:22 From Tech Challenge : Attendance check:
10:56:22 From inbar shirizly : אביתר כהן
10:56:23 From May Steinfeld : אלעד כהן
10:56:23 From May Steinfeld : Attendance check
10:56:24 From Tammuz Dubnov : עידן אביב
10:56:24 From Daniel Kagan : here
10:56:25 From Dana Makov : here
10:58:26 From Ron Zehavi : 534234210

11:56:16 From Tech Challenge : Attendance check:

11:56:20 From May Steinfeld : אבי כהן
11:56:22 From inbar shirizly : 305696031
11:56:22 From inbar shirizly : אביתר כהן
11:56:23 From May Steinfeld : 530342413
11:56:24 From Tammuz Dubnov : עידן אביב
11:59:24 From Tammuz Dubnov : מיתר כהן
10 changes: 10 additions & 0 deletions server/test/files_to_test/students_list_excel/example_csv.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
id,phone,id_number,name,org_class,gender,class_id
11,528702484,305049421,???? ????,,,
12,524291930,123424343,????? ???,,,
13,526148959,432424455,???? ??????,,,
14,523454564,423423649,???? ???,,,
15,530342423,305696031,??? ????,,,
16,530342413,305696041,???? ??????,,,
17,537642324,534234210,???? ???,,,
,,,,,,
18,537642324,534234453,???? ???,,,
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
id,phone,id_number,name,org_class,gender,class_id,
11,528702484,305049421,???? ????,,,,
12,524291930,123424343,????? ???,,,,
13,526148959,432424455,???? ??????,,,,
14,523454564,423423649,???? ???,,,,
15,530342423,305696031,??? ????,,,,
16,530342413,305696041,???? ??????,,,,
17,537642324,534234210,???? ???,,,,
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
id,phone,id_number,name,org_class,gender,class_id,11,528702484,305049421,???? ????,,,,12,524291930,123424343,????? ???,,,,13,526148959,432424455,???? ??????,,,,14,523454564,423423649,???? ???,,,,15,530342423,305696031,??? ????,,,,16,530342413,305696041,???? ??????,,,,17,537642324,534234210,???? ???,,,,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
id,phone,id_number,name,org_class,gender,class_id,
11,528702484,305049421,???? ????,,,,
12,524291930,123424343,????? ???,,,,
13,526148959,432424455,???? ??????,,,,
14,523454564,423423649,???? ???,,,,
15,530342423,305696031,??? ????,,,,
16,530342413,305696041,???? ??????,,,,
17,537642324,534234210,???? ???,,,,
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
26 changes: 26 additions & 0 deletions server/test/test_attendance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import sys
from pathlib import Path

import pytest

sys.path.append('../')
from server.parsing.attendance import Attendance
from server.parsing.utils import create_chat_df, create_students_df


# Fixture files are resolved relative to this test module so the suite does
# not depend on any particular machine's directory layout.
TEST_FILES_DIR = Path(__file__).resolve().parent / "files_to_test"
chat_file_path = TEST_FILES_DIR / "chat_files" / "chat_file_valid.txt"
excel_file_path = TEST_FILES_DIR / "students_list_excel" / "example_csv.csv"


@pytest.fixture()
def df_students():
    """Students table loaded from the sample students file."""
    return create_students_df(file_name=excel_file_path.name, file_data=str(excel_file_path))


@pytest.fixture()
def chat_file():
    """Chat DataFrame built from the lines of the sample chat file."""
    with open(chat_file_path, "r", encoding="utf-8") as f:
        chat_df = create_chat_df(f.readlines())
    return chat_df


def test_first_message_time(chat_file, df_students):
    # The inputs are fixtures: the previous parametrize decorators referenced
    # a module-level `chat_df` that was never defined, so collection failed.
    report = Attendance(chat_file, df_students, ['name', "id_number", "phone"], 1, "Attendance check", ["ITC", "Tech", "Challenge"])
    assert report.first_message_time.hour == 10
Loading

0 comments on commit e161ce5

Please sign in to comment.