-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_scraper.py
162 lines (127 loc) · 6.22 KB
/
test_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
from sked_parser.scraper import (
create_id,
extract_semester,
optimize_label,
guess_degree,
)
def test_extract_semester_normal():
    """A plain "<course> - N. Semester" label is expected to yield N."""
    assert extract_semester("Angewandte Informatik - 1. Semester", "") == 1
def test_extract_semester_multiple_numbers():
    """Labels that contain additional numbers must not confuse the result."""
    # A year ("PO 2018") appears before the actual semester number.
    with_year = "Wasser- und Bodenmanagement - PO 2018 - 3. Semester"
    result = extract_semester(with_year, "")
    assert result == 3

    # A two-digit "20. Sem" is expected to produce no semester at all.
    two_digit = "Wasser- und Bodenmanagement - 20. Sem"
    result = extract_semester(two_digit, "")
    assert result is None
def test_extract_semester_abbrevation():
    """The abbreviated form "Sem." is expected to be recognised as well."""
    abbreviated_label = "1. Sem. EIT"
    found = extract_semester(abbreviated_label, "")
    assert found == 1
def test_extract_semester_no_delimiter():
    """Labels with no delimiter, or a non-digit right after the number, match too."""
    for label in ("IVG_1_1.Sem", "1 Sem Informatik"):
        assert extract_semester(label, "") == 1
def test_extract_semester_no_semester():
    """A label without any semester information is expected to yield None."""
    assert extract_semester("IMES Teilzeit 2018", "") is None
def test_extract_semester_duplicated_sem():
    """The same semester mentioned twice in one label still yields that semester."""
    repeated_label = (
        "Soziale Arbeit - 5. Semester - PO 2018 - 5. Semester - Soziale Arbeit"
    )
    found = extract_semester(repeated_label, "")
    assert found == 5
def test_extract_semester_multiple_semesters():
    """With several semesters in one label, only the last one is returned (for now)."""
    semester_range_label = "Bio- und Umwelttechnik (BEE ) - 3. - 4. Semester"
    found = extract_semester(semester_range_label, "")
    assert found == 4
def test_extract_semester_wahlpflicht():
    """Elective offers ("Wahlpflichtfächer") are expected to be reported as "WPF"."""
    elective_labels = (
        "Management im Gesundheitswesen Wahlpflichtangebot",
        # "WPF" must win even when a semester number is present in the label.
        "Wahlpflichtangebot (WPF MIG) 5. und höhere Sem.",
    )
    for label in elective_labels:
        assert extract_semester(label, "") == "WPF"
def test_extract_semester_url_fallback():
    """The semester is expected to come from the URL when the description has none."""
    plan_url = "i/Semester/Semester-Liste/I-B.Sc. WI 1. Sem..html"
    found = extract_semester("Nothing in here", plan_url)
    assert found == 1
def test_extract_semester_url_digit_at_end():
    """A digit at the very end of the URL file name is expected to be the semester."""
    plan_url = "r/studentenset/23-03-r-b-rfs-2.html"
    found = extract_semester("Nothing in here", plan_url)
    assert found == 2
def test_extract_semester_url_no_digit_in_middle():
    """A digit in the middle of the URL file name is expected not to count as a semester."""
    plan_url = "r/studentenset/23-2-r-b-rfs.html"
    found = extract_semester("Nothing in here", plan_url)
    assert found is None
def test_extract_semester_fachsemester_string():
    """The word "Fachsemester" is expected to be handled like "Semester"."""
    assert extract_semester("2. Fachsemester Smart Vehicle Systems", "") == 2
def test_optimize_label_strip_semester():
    """Verify that semester markers are removed from the label.

    Every case calls optimize_label with False as its second argument.
    """
    cases = [
        # Semester at end
        ("Bauingenieurwesen - 1. Semester", "Bauingenieurwesen"),
        # Semester at start
        ("4. Semester Servicetechnik und Prozesse", "Servicetechnik und Prozesse"),
        # Duplicated / multiple semester strings
        ("5. Semester - PO 2018 - 5. Semester - Handel", "PO 2018 - Handel"),
        # Multiple semesters in one substring
        ("Umwelttechnik - 3. - 4. Semester", "Umwelttechnik"),
        # "Fachsemester" instead of "Semester"
        ("1. Fachsemester Smart Vehicle Systems", "Smart Vehicle Systems"),
        # Semester shorthand used
        ("Wirtschaftsinformatik 5. Sem.", "Wirtschaftsinformatik"),
        # Even shorter semester shorthand used
        ("Wirtschaftsinformatik 5 Sem", "Wirtschaftsinformatik"),
        # Weird duplicated semester string with CSV at end
        ("WI_4_4. Sem..csv", "WI"),
    ]
    for raw_label, expected in cases:
        assert optimize_label(raw_label, False) == expected
def test_optimize_label_shorthand_strip():
    """Verify that the shorthand replaces the long form when requested.

    Every case calls optimize_label with True as its second argument.
    """
    cases = [
        # Simple shorthand and text after it
        ("Energie- und Gebäudetechnik (EGT) - TGA", "EGT - TGA"),
        # Shorthand with special chars and extra whitespace
        ("Energie- und Gebäudetechnik ( EGT / EGTiP ) - TGA", "EGT / EGTiP - TGA"),
        # Shorthand string with numbers in it should not be replaced/used
        ("Vertiefung CE (PO18)", "Vertiefung CE (PO18)"),
    ]
    for raw_label, expected in cases:
        assert optimize_label(raw_label, True) == expected
def test_extract_id():
    """Verify the IDs that create_id builds for a range of sked paths.

    Every case passes faculty short name "e", semester string "ws" and
    semester number 1 alongside the path.
    """
    faculty = "e"
    term = "ws"
    semester = 1

    def in_semester_dir(stem):
        # Wrap a file stem into the path layout used by most cases.
        return f"e/semester/{stem}.html"

    cases = [
        # Simple string
        (in_semester_dir("eit"), "e_eit_1_ws"),
        # Dot at end
        (in_semester_dir("eit."), "e_eit_1_ws"),
        # Duplicated semester
        (in_semester_dir("RPP_1_1. Sem"), "e_rpp_1_ws"),
        # Faculty shortname already present
        (in_semester_dir("E-eit"), "e_eit_1_ws"),
        # Multiple special chars
        (in_semester_dir("b-.-eit"), "e_b_eit_1_ws"),
        # Special URL for faculty E
        ("e/E-IST.html", "e_ist_1_ws"),
        # Complicated semester specification
        (in_semester_dir("PSA_M_1. Semester_Schwerpunkt"), "e_psa_m_schwerpunkt_1_ws"),
    ]
    for path, expected_id in cases:
        assert create_id(path, faculty, term, semester) == expected_id
def test_is_master():
    """guess_degree is expected to report "Master" for a path containing "_ma_"."""
    stem = "b_stgrp_ma_glob_1"
    plan_path = f"e/semester/{stem}.html"
    degree = guess_degree("", plan_path)
    assert degree == "Master"