Skip to content

Latest commit

 

History

History
215 lines (153 loc) · 6.66 KB

File metadata and controls

215 lines (153 loc) · 6.66 KB

Graduation_Examination_Analytics

Credit: Dung Lai

You can view his course here: https://dunglailaptrinh.com/L-p-H-c-Data-Science-C-B-n-Python-c735d90b891a4351b658fff8d8cab589

Insights

  1. dfds
  2. ádasd
  3. dsasd

What I did in this dataset:

  1. Used file I/O to read raw_data.txt
  2. Used basic techniques to extract clean data
  3. Visualized the following insights:
    • Top first names of candidates
    • Top last names of candidates
    • Average scores of 11 age groups
    • Average scores of candidates who did not sit for some subjects
    • Numbers of candidates who did not sit for some subjects

Explained approach

Import dataset

import csv
import re
# Read the whole raw dump and split it into one record per line.
# The context manager closes the handle as soon as the content is read,
# instead of leaving it open for the rest of the script.
with open("raw_data.txt", "r") as file:
    datas = file.read().split("\n")

Add header to file

# Column names, in the same order as the fields of each cleaned row.
header = ["sbd", "tên", "dd", "mm", "yy", "toán", "ngữ văn", "khxh", "khtn", "lịch sử", "địa lí", "gdcd", "sinh học", "vật lí", "hóa học", "tiếng anh"]

# Mode "w" starts clean_data.csv from scratch with only the header row.
with open("clean_data.csv", "w", encoding="utf8", newline="") as file_csv:
    csv.writer(file_csv).writerow(header)

Cleaning process

Add a leading 0 to the sbd string and skip any sbd that appears in the exclusion list

# sbd values that are skipped with `continue` in the loop below.
# Built once, before the loop, as a frozenset so each membership test is a
# hash lookup rather than a linear scan of a list rebuilt on every record.
SKIPPED_SBD = frozenset({
    2000521, 2002776, 2002833, 2005380, 2005472, 2005733, 2005820, 2005876,
    2006091, 2006300, 2006364, 2006544, 2006712, 2006720, 2006904, 2008746,
    2009196, 2012503, 2019593, 2020755, 2024536, 2027212, 2031588, 2031948,
    2035434, 2036693, 2042067, 2042972, 2043577, 2044668, 2046177, 2046483,
    2046496, 2046651, 2046766, 2046771, 2046788, 2046810, 2046841, 2046998,
    2047031, 2047122, 2047241, 2047273, 2047304, 2047486, 2047636, 2047834,
    2047843, 2047856, 2047865, 2048225, 2048271, 2048279, 2048397, 2048424,
    2048427, 2048592, 2048660, 2048701, 2048723, 2048858, 2049069, 2049090,
    2049104, 2049164, 2049234, 2049312, 2049383, 2049663, 2049763, 2049775,
    2049891, 2049971, 2050378, 2050476, 2050488, 2050516, 2050526, 2050540,
    2050576, 2050642, 2050649, 2050722, 2050809, 2050814, 2050899, 2050959,
    2050978, 2050984, 2050985, 2051006, 2051072, 2051181, 2051191, 2051234,
    2051422, 2051468, 2051472, 2051495, 2051615, 2051616, 2051736, 2052013,
    2052030, 2052089, 2052314, 2052373, 2052591, 2052663, 2052711, 2052791,
    2052856, 2053000, 2053106, 2053259, 2053593, 2053699, 2053860, 2054235,
    2054306, 2054374, 2054508, 2054733, 2054787, 2055119, 2055200, 2055290,
    2055296, 2055606, 2055683, 2055803, 2055829, 2055912, 2055930, 2055986,
    2056020, 2056032, 2056105, 2056139, 2056186, 2056190, 2056238, 2056273,
    2056291, 2056298, 2056333, 2056350, 2056377, 2056393, 2056782, 2056823,
    2056865, 2056871, 2057014, 2057294, 2057410, 2057496, 2058404, 2058498,
    2058518, 2058789, 2058938, 2059095, 2059163, 2059740, 2059751, 2059769,
    2059774, 2059807, 2059852, 2060462, 2060492, 2060536, 2060610, 2060652,
    2060656, 2060660, 2060730, 2060738, 2061813, 2062212, 2062236, 2062391,
    2062440, 2062898, 2063109, 2063114, 2063179, 2063180, 2063181, 2063207,
    2063272, 2063653, 2063707, 2063716, 2063752, 2063754, 2063825, 2064369,
    2064704, 2064783, 2064990, 2065104, 2065323, 2065604, 2065877, 2065995,
    2066106, 2066212, 2066835, 2067172, 2067291, 2067316, 2067371, 2067383,
    2067401, 2067446, 2067467, 2067550, 2067563, 2067659, 2067672, 2067698,
    2067762, 2067909, 2067971, 2067996, 2068089, 2068119, 2068156, 2068174,
    2068178, 2068243, 2068287, 2068365, 2068382, 2068427, 2068453, 2068548,
    2068550, 2068627, 2068667, 2068702, 2068732, 2068846, 2068970, 2069028,
    2069043, 2069066, 2069156, 2069290, 2069362, 2069397, 2069843, 2069990,
    2070203, 2070870, 2071102, 2071574, 2072480, 2072549, 2072755, 2072823,
    2073036, 2073372, 2073477, 2073556, 2073964, 2074135, 2074254, 2074281,
    2074367, 2074607, 2074719,
})

sbd = 2000000
for data in datas:
    # The counter advances for every raw record, including skipped ones.
    sbd += 1
    if sbd in SKIPPED_SBD:
        continue
    # Prefix a "0" to the number to form the sbd string.
    sbd_str = "0" + str(sbd)

Split each raw record into a list of lines

# Split the record on the two-character sequence backslash + "n"
# ("\\n" is an escaped backslash followed by n, not a newline character).
data = data.split("\\n")

Remove the \r and \t symbols

    # Drop every backslash-r and backslash-t marker (two-character
    # sequences, not control characters) from each line, in place.
    for idx, line in enumerate(data):
        data[idx] = line.replace("\\r", "").replace("\\t", "")

Remove tags

    # Strip every "<...>" tag from each line of the record.
    for i, line in enumerate(data):
        tags = []
        # Reset per line: a ">" with no preceding "<" on the same line must
        # not reuse a "<" position left over from an earlier line or tag
        # (and must not fail when no "<" has been seen at all).
        begin = None
        for j, char in enumerate(line):
            if char == "<":
                begin = j
            elif char == ">" and begin is not None:
                tags.append(line[begin:j + 1])
                begin = None
        # Remove each collected tag text from the line.
        for tag in tags:
            line = line.replace(tag, "")
        data[i] = line

Remove leading/trailing whitespace and empty lines

   # Trim surrounding whitespace from every line, then keep only the
   # lines that still have content.
   data = [line.strip() for line in data]
   data = [line for line in data if line != ""]

Choose relevant information

   # Positions 7, 8 and 9 of the cleaned record are taken as the name,
   # the date of birth and the score line respectively.
   name, dob, scores = data[7], data[8], data[9]

Load unicode table

   # Load the lookup table from unicode.txt. Each line is split on a single
   # space: the first field is stored in chars, the second in codes.
   # NOTE(review): this snippet is indented like the per-record loop body,
   # so the table appears to be re-read for every record; consider loading
   # it once before the loop.
   chars = []
   codes = []
   # The context manager closes the file once the table has been read.
   with open("unicode.txt", "r", encoding="utf8") as file:
       unicode_table = file.read().split("\n")

   for code in unicode_table:
       x = code.split(" ")
       # Skip blank or malformed lines (e.g. the empty string produced by a
       # trailing newline), which would otherwise raise IndexError on x[1].
       # An empty code is skipped too, because str.replace("", ...) would
       # insert the character between every pair of characters.
       if len(x) < 2 or not x[1]:
           continue
       chars.append(x[0])
       codes.append(x[1])

Replace special characters in name and scores

   # Swap each encoded sequence for its character in both fields.
   for char, code in zip(chars, codes):
       name = name.replace(code, char)
       scores = scores.replace(code, char)

Replace web codes (HTML numeric character references) with characters in name

   # Decode HTML numeric character references such as "&#225;" in name.
   # Matching all digits up to the ";" handles codes of any length: fixed
   # 3-digit slicing mis-decodes longer codes (e.g. "&#7843;") and raises
   # ValueError on shorter ones (e.g. "&#65;").
   name = re.sub(r"&#([0-9]+);", lambda ref: chr(int(ref.group(1))), name)

Replace web codes (HTML numeric character references) with characters in scores

    # Decode HTML numeric character references such as "&#225;" in scores.
    # Matching all digits up to the ";" handles codes of any length: fixed
    # 3-digit slicing mis-decodes longer codes (e.g. "&#7843;") and raises
    # ValueError on shorter ones (e.g. "&#65;").
    scores = re.sub(r"&#([0-9]+);", lambda ref: chr(int(ref.group(1))), scores)

Change to lower case

    # Normalise both fields to lower case before further processing.
    name, scores = name.lower(), scores.lower()

Split dob

   # Split dob on "/" and read its first three parts as the
   # day, month and year integers.
   dob_list = dob.split("/")
   dd, mm, yy = int(dob_list[0]), int(dob_list[1]), int(dob_list[2])

Process scores

  # Drop the colons, then widen the single space after "khxh" and "khtn"
  # to three spaces so the three-space split below separates them from
  # whatever follows.
  scores = (
      scores.replace(":", "")
      .replace("khxh ", "khxh   ")
      .replace("khtn ", "khtn   ")
  )
  scores_list = scores.split("   ")

  # Start the output row: sbd, title-cased name, then day, month, year as text.
  data = [sbd_str, name.title(), str(dd), str(mm), str(yy)]

Add score to data

 # For each subject column, append the entry that follows the subject name
 # in scores_list; a subject that is absent is recorded as "-1".
 for subject in ["toán", "ngữ văn", "khxh", "khtn", "lịch sử", "địa lí", "gdcd", "sinh học", "vật lí", "hóa học", "tiếng anh"]:
     try:
         name_pos = scores_list.index(subject)
     except ValueError:
         data.append("-1")
     else:
         data.append(str(scores_list[name_pos + 1]))

Write data to clean_data.csv

# Append the cleaned row to clean_data.csv (mode "a" keeps existing rows).
with open("clean_data.csv", "a", encoding="utf8", newline="") as file_csv:
    csv.writer(file_csv).writerow(data)

Visualization

Top first_name

# NOTE(review): this snippet appends `data` as one more row to
# clean_data.csv and is identical to the "Write data" step; it does not
# compute or plot anything about first names. Presumably pasted here by
# mistake in place of the visualization code -- TODO confirm and replace.
with open("clean_data.csv", "a", encoding="utf8",newline ="") as file_csv:
        writer = csv.writer(file_csv)
        writer.writerow(data)