From 4687a7813c81315037ba8da3f2a1fccfdc376e14 Mon Sep 17 00:00:00 2001
From: "Lidiane M. Gomes"
Date: Thu, 22 Aug 2024 14:21:53 -0300
Subject: [PATCH] =?UTF-8?q?Exerc=C3=ADcio=20de=20an=C3=A1lise=20da=20base?=
 =?UTF-8?q?=20Employees?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 exercicios/para-casa/ETL_employee.py | 78 ++++++++++++++++++++++++++++
 exercicios/para-sala/aula.py         |  0
 exercicios/para-sala/aula_by_me.py   | 71 ++++++++++++++++++++++++++
 3 files changed, 149 insertions(+)
 create mode 100644 exercicios/para-casa/ETL_employee.py
 delete mode 100644 exercicios/para-sala/aula.py
 create mode 100644 exercicios/para-sala/aula_by_me.py

diff --git a/exercicios/para-casa/ETL_employee.py b/exercicios/para-casa/ETL_employee.py
new file mode 100644
index 0000000..66d85bd
--- /dev/null
+++ b/exercicios/para-casa/ETL_employee.py
@@ -0,0 +1,78 @@
+import pandas as pd
+import datetime as dt
+import matplotlib.pyplot as plt
+
+# Repo-relative path (run from exercicios/para-casa) instead of a
+# machine-specific absolute path.
+df = pd.read_csv("../../material/Employee.csv")
+
+# info() prints its own report and returns None, so no print() wrapper.
+df.info()
+print(df.isnull().sum())
+print(df.duplicated().sum())
+# drop_duplicates(inplace=True) also returns None; call it for its effect only.
+df.drop_duplicates(inplace=True)
+print(df.duplicated().sum())
+df.info()
+print(df.describe())
+
+current_year = dt.date.today().year
+five_years_ago = current_year - 5
+filtered_df = df[(df["JoiningYear"] <= five_years_ago)]
+print(filtered_df.describe())
+
+df["Age"].value_counts().sort_index().plot(kind="barh", title="Empregados por Idade", xlabel="Quantidade", ylabel="Idades", color="goldenrod")
+plt.show()
+
+df["Gender"].value_counts().plot(kind="pie", title="Empregados por Gênero", colors=["lightpink", "lightskyblue"], autopct="%.2f%%")
+plt.ylabel("")
+plt.legend()
+plt.show()
+
+# The most frequent city comes from value_counts(); Series.max() would
+# return the alphabetically last city name instead.
+most_employees_city = df["City"].value_counts().idxmax()
+print("A cidade com mais empregados é", most_employees_city)
+
+df["LenghtService"] = current_year - df["JoiningYear"]
+lenght_service_mean_by_city = df.groupby(["City"])["LenghtService"].mean()
+print(lenght_service_mean_by_city)
+
+total_employees = len(df)
+not_working = df["LeaveOrNot"].value_counts()
+not_working_pct = (not_working / total_employees) * 100
+print(f"Em {current_year} cerca de {not_working_pct[0]:.2f}% dos empregados ainda trabalham na empresa.")
+
+empolyees = len(df["PaymentTier"]) - not_working[1]
+print(f"Atualmente existem {empolyees} empregados na empresa.")
+
+def convertion(value):
+    """Map an EverBenched answer to a boolean.
+
+    Returns True for "Yes", False for "No" and the string
+    "Não categorizado" for any other value.
+    """
+    if value == "Yes":
+        return True
+    if value == "No":
+        return False
+    else:
+        return "Não categorizado"
+
+df["EverBenched"] = df["EverBenched"].apply(convertion)
+print(df["EverBenched"])
+
+# include the percentage in the legend and label
+# NOTE: labels are positional and value_counts() sorts by frequency, so
+# the "Não" label assumes False is the most frequent value.
+df["EverBenched"].value_counts().plot(kind="pie", title="Empregados que já estiveram no banco", labels=["Não", "Sim"], colors=["peru", "sienna"], autopct="%.2f%%", explode=[0, 0.06])
+plt.ylabel("")
+plt.legend()
+plt.show()
+
+# NOTE: same positional-label assumption: the first label goes to the
+# most frequent LeaveOrNot value.
+df["LeaveOrNot"].value_counts().plot(kind="pie", title="Empregados que já saíram da empresa", labels=["Permanecem na empresa", "Saíram da empresa"], colors=["seagreen", "lightgreen"], autopct="%.2f%%", explode=[0, 0.06])
+plt.ylabel("")
+plt.legend()
+plt.show()
\ No newline at end of file
diff --git a/exercicios/para-sala/aula.py b/exercicios/para-sala/aula.py
deleted file mode 100644
index e69de29..0000000
diff --git a/exercicios/para-sala/aula_by_me.py b/exercicios/para-sala/aula_by_me.py
new file mode 100644
index 0000000..02d74f4
--- /dev/null
+++ b/exercicios/para-sala/aula_by_me.py
@@ -0,0 +1,71 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+
+df = pd.read_csv("../../material/desenvolvimento_paises.csv")
+
+# print(df.describe())
+# print(df.info())
+# # Do this for one specific column
+# print(df["AveragScore"].value_counts())
+
+# Modifies the original df. To leave it untouched, omit inplace
+# print(df.fillna(value=0, inplace=True))
+# print(df.isnull().sum())
+# print(df.duplicated().sum())
+# print(df.drop_duplicates(inplace=True))
+# print(df.duplicated().sum())
+
+# country_greater_security_value = df["SafetySecurity"].max()
+# country_lesser_security_value = df["SafetySecurity"].min()
+# print(country_greater_security_value)
+# print(country_lesser_security_value)
+# print("A diferença entre o maior país com SafetySecurity é de: ", country_greater_security_value - country_lesser_security_value)
+
+# line_higher_security_value = df[df["SafetySecurity"] == country_greater_security_value]
+# print(line_higher_security_value)
+
+# index_greater_value = df["SafetySecurity"].idxmax()
+# print(df.loc[index_greater_value])
+
+columns_name = df.columns
+columns_name = columns_name.drop(labels="Country")
+
+def categorizar_valores(valor):
+    """Classify a mean score into a development category.
+
+    As implemented: above 80 is "Desenvolvido", 50 up to (not including)
+    70 is "Em desenvolvimento" and below 50 is "Subdesenvolvido". Any
+    other value (70 to 80 inclusive, or NaN) is "Não categorizado".
+    """
+    # NOTE(review): 70-80 is left uncategorized and the "< 60" test only
+    # ever matches values below 50; confirm the intended thresholds.
+    if valor > 80:
+        return "Desenvolvido"
+    if valor < 70 and valor >= 50:
+        return "Em desenvolvimento"
+    if valor < 60:
+        return "Subdesenvolvido"
+    else:
+        return "Não categorizado"
+
+df["Media"] = df[columns_name].mean(axis=1)
+df["Categoria_Desenvolvimento"] = df["Media"].apply(categorizar_valores)
+
+# print(df["Media"])
+# print(df["Categoria_Desenvolvimento"].value_counts())
+
+development_category_counts = df["Categoria_Desenvolvimento"].value_counts()
+print(development_category_counts)
+
+development_category_counts.plot(kind="bar", title="QTD Países por Categoria")
+# plt.xticks(rotation=45)
+# plt.show()
+
+# where() turns non-matching rows into NaN; dropna() then removes every
+# row that contains a NaN.
+personel_freedom_filter = df.where((df["PersonelFreedom"] <= 30) & (df["Categoria_Desenvolvimento"] == "Subdesenvolvido"))
+
+personel_freedom_filter.dropna(inplace=True)
+
+# info() prints its own report and returns None, so no print() wrapper.
+personel_freedom_filter.info()
\ No newline at end of file