Skip to content

Latest commit

 

History

History
215 lines (153 loc) · 6.66 KB

File metadata and controls

215 lines (153 loc) · 6.66 KB

Graduation_Examination_Analytics

Credit: Dung Lai

You can view his course here: https://dunglailaptrinh.com/L-p-H-c-Data-Science-C-B-n-Python-c735d90b891a4351b658fff8d8cab589

Insights

  1. dfds
  2. ádasd
  3. dsasd

What I did in this dataset:

  1. Used file I/O to read raw_data.txt
  2. Used basic techniques to extract clean data
  3. Visualized the following insights:
    • Top first names of candidates
    • Top last names of candidates
    • Average scores of 11 age groups
    • Average scores of candidates who did not sit for some subjects
    • Numbers of candidates who did not sit for some subjects

Explained approach

Import dataset

import csv
# Read the whole raw file and split it into one record per line. The context
# manager closes the handle as soon as the text has been read.
with open("raw_data.txt", "r") as file:
    datas = file.read().split("\n")

Add header to file

# Start clean_data.csv from scratch ("w" truncates) with the column names as
# its only row; the data rows are appended later.
header = [
    "sbd", "tên", "dd", "mm", "yy",
    "toán", "ngữ văn", "khxh", "khtn", "lịch sử", "địa lí",
    "gdcd", "sinh học", "vật lí", "hóa học", "tiếng anh",
]
with open("clean_data.csv", "w", encoding="utf8", newline="") as file_csv:
    csv.writer(file_csv).writerow(header)

Cleaning process

Prepend 0 to the sbd string and skip the sbd values that must be excluded

# sbd is a running counter: it is incremented once per raw record, so the first
# record gets 2000001.
sbd = 2000000
for data in datas:
    sbd += 1
    # NOTE(review): the membership test below is truncated in this snippet; the
    # collection of sbd values to skip is not shown. Restore it before running.
    if sbd in
        continue
    # Prefix the number with "0" to build the sbd string written to the CSV.
    sbd_str = "0" + str(sbd)

Split each record into a list of lines

# Split on the literal two-character sequence backslash + "n" (the pattern is
# "\\n", not a real newline character), turning the record string into a list.
data = data.split("\\n")

Remove the \r and \t symbols

    # Drop the literal "\r" and "\t" sequences (backslash + letter) from every line.
    for idx, line in enumerate(data):
        data[idx] = line.replace("\\r", "").replace("\\t", "")

Remove tags

    # Strip markup: collect every "<...>" span in a line, then delete each one.
    for i, line in enumerate(data):
        tags = []
        # Index of the most recent unmatched "<" in THIS line. It is reset per
        # line so a stray ">" can never pair with a "<" from an earlier line
        # (the original reused the stale index, or raised NameError when no
        # "<" had been seen yet).
        begin = None
        for j, ch in enumerate(line):
            if ch == "<":
                begin = j
            elif ch == ">" and begin is not None:
                tags.append(line[begin:j + 1])
                # Require a fresh "<" before the next ">" counts as a tag end.
                begin = None
        for tag in tags:
            line = line.replace(tag, "")
        data[i] = line

Remove leading/trailing whitespace and empty lines

   # Trim surrounding whitespace from every line, then keep only the lines
   # that still contain text.
   for idx, line in enumerate(data):
       data[idx] = line.strip()
   data = [line for line in data if line != ""]

Choose relevant information

   # After cleaning, the name, date of birth and scores are read from
   # positions 7, 8 and 9 of the record.
   name, dob, scores = data[7], data[8], data[9]

Load unicode table

   # Build two parallel lists from unicode.txt: each line holds a character and
   # its encoded form separated by a space, so codes[k] is replaced by chars[k].
   chars = []
   codes = []
   # The context manager closes the table file once it has been read.
   with open("unicode.txt", "r", encoding="utf8") as file:
       unicode_table = file.read().split("\n")

   for code in unicode_table:
       x = code.split(" ")
       # Skip blank or malformed lines (e.g. the empty string left after a
       # trailing newline), which have no second field to read.
       if len(x) < 2:
           continue
       chars.append(x[0])
       codes.append(x[1])

Replace special characters in name and scores

   # Substitute every encoded form with its character, in table order, in both
   # the name and the scores text.
   for char, code in zip(chars, codes):
       name = name.replace(code, char)
       scores = scores.replace(code, char)

Replace HTML character codes (&#NNN;) with characters in name

   # Decode decimal HTML character references ("&#NNN;") in name.
   # The digits are read up to the terminating ";" rather than assuming exactly
   # three of them, so shorter or longer codes decode correctly instead of
   # raising ValueError or producing the wrong character.
   start = name.find("&#")
   while start != -1:
       stop = name.find(";", start + 2)
       digits = name[start + 2:stop] if stop != -1 else ""
       # Only decode well-formed references that map to a valid code point.
       if digits.isdecimal() and int(digits) <= 0x10FFFF:
           name = name[:start] + chr(int(digits)) + name[stop + 1:]
       # Resume after this position so a decoded "&" is never re-interpreted.
       start = name.find("&#", start + 1)

Replace HTML character codes (&#NNN;) with characters in scores

    # Decode decimal HTML character references ("&#NNN;") in scores.
    # The digits are read up to the terminating ";" rather than assuming exactly
    # three of them, so shorter or longer codes decode correctly instead of
    # raising ValueError or producing the wrong character.
    start = scores.find("&#")
    while start != -1:
        stop = scores.find(";", start + 2)
        digits = scores[start + 2:stop] if stop != -1 else ""
        # Only decode well-formed references that map to a valid code point.
        if digits.isdecimal() and int(digits) <= 0x10FFFF:
            scores = scores[:start] + chr(int(digits)) + scores[stop + 1:]
        # Resume after this position so a decoded "&" is never re-interpreted.
        start = scores.find("&#", start + 1)

Change to lower case

    # Normalise both texts to lower case so the later subject-name lookups match.
    name, scores = name.lower(), scores.lower()

Split dob

   # Break the "/"-separated date of birth into its three integer components.
   dob_list = dob.split("/")
   dd, mm, yy = int(dob_list[0]), int(dob_list[1]), int(dob_list[2])

Process scores

  # Drop the colons and widen the gap after "khxh"/"khtn" to three spaces, so
  # that splitting on three spaces separates those labels from what follows.
  for old, new in ((":", ""), ("khxh ", "khxh   "), ("khtn ", "khtn   ")):
      scores = scores.replace(old, new)
  scores_list = scores.split("   ")

  # Begin the output row: sbd string, title-cased name, then day, month, year.
  data = [sbd_str, name.title(), str(dd), str(mm), str(yy)]

Add scores to data

 # Append one value per subject, in CSV column order: the entry that follows
 # the subject name in scores_list, or "-1" when the subject is absent.
 subjects = ["toán", "ngữ văn", "khxh", "khtn", "lịch sử", "địa lí", "gdcd", "sinh học", "vật lí", "hóa học", "tiếng anh"]
 for subject in subjects:
     if subject not in scores_list:
         data.append("-1")
         continue
     score_position = scores_list.index(subject) + 1
     data.append(str(scores_list[score_position]))

Write data to clean_data.csv

# Append this record as one more row ("a" mode keeps the rows already written).
with open("clean_data.csv", "a", encoding="utf8", newline="") as file_csv:
    csv.writer(file_csv).writerow(data)

Visualization

Top first_name

# Appends `data` as a row to clean_data.csv.
# NOTE(review): this snippet is identical to the row-writing step and contains
# no plotting or counting logic. It looks like a placeholder for the
# "Top first_name" visualization; confirm and replace with the real code.
with open("clean_data.csv", "a", encoding="utf8",newline ="") as file_csv:
        writer = csv.writer(file_csv)
        writer.writerow(data)