diff --git a/eda.ipynb b/eda.ipynb new file mode 100644 index 0000000..b0c55ad --- /dev/null +++ b/eda.ipynb @@ -0,0 +1,1287 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np\n", + "from wordcloud import WordCloud, STOPWORDS" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Get the CSV file from google drive" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "url='https://drive.google.com/file/d/1QuvhMiZLka18ZXnx8o1P5C8Cf5oHCgjL/view?usp=sharing'\n", + "url='https://drive.google.com/uc?id=' + url.split('/')[-2]\n", + "df_movies_orig = pd.read_csv(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df_movies = df_movies_orig.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | Unnamed: 0 | \n", + "budget | \n", + "id | \n", + "original_language | \n", + "overview | \n", + "popularity | \n", + "release_date | \n", + "revenue | \n", + "runtime | \n", + "status | \n", + "... | \n", + "vote_count | \n", + "return | \n", + "belongs_to_collection_name | \n", + "genres_name | \n", + "production_companies_name | \n", + "production_countries_name | \n", + "spoken_languages_name | \n", + "release_year | \n", + "cast | \n", + "director | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "0 | \n", + "30000000.0 | \n", + "862 | \n", + "en | \n", + "Led by Woody, Andy's toys live happily in his ... | \n", + "21.946943 | \n", + "1995-10-30 | \n", + "373554033.0 | \n", + "81.0 | \n", + "Released | \n", + "... | \n", + "5415.0 | \n", + "12.451801 | \n", + "Toy Story Collection | \n", + "['Animation', 'Comedy', 'Family', '', '', '', ... | \n", + "['Pixar Animation Studios', '', '', '', '', ''... | \n", + "['United States of America', '', '', '', '', '... | \n", + "['English', '', '', '', '', '', '', '', '', ''... | \n", + "1995 | \n", + "['Tom Hanks', 'Tim Allen', 'Don Rickles', 'Jim... | \n", + "John Lasseter | \n", + "
1 | \n", + "1 | \n", + "65000000.0 | \n", + "8844 | \n", + "en | \n", + "When siblings Judy and Peter discover an encha... | \n", + "17.015539 | \n", + "1995-12-15 | \n", + "262797249.0 | \n", + "104.0 | \n", + "Released | \n", + "... | \n", + "2413.0 | \n", + "4.043035 | \n", + "NaN | \n", + "['Adventure', 'Fantasy', 'Family', '', '', '',... | \n", + "['TriStar Pictures', 'Teitler Film', 'Intersco... | \n", + "['United States of America', '', '', '', '', '... | \n", + "['English', 'Français', '', '', '', '', '', ''... | \n", + "1995 | \n", + "['Robin Williams', 'Jonathan Hyde', 'Kirsten D... | \n", + "Larry J. Franco | \n", + "
2 | \n", + "2 | \n", + "0.0 | \n", + "15602 | \n", + "en | \n", + "A family wedding reignites the ancient feud be... | \n", + "11.712900 | \n", + "1995-12-22 | \n", + "0.0 | \n", + "101.0 | \n", + "Released | \n", + "... | \n", + "92.0 | \n", + "0.000000 | \n", + "Grumpy Old Men Collection | \n", + "['Romance', 'Comedy', '', '', '', '', '', ''] | \n", + "['Warner Bros.', 'Lancaster Gate', '', '', '',... | \n", + "['United States of America', '', '', '', '', '... | \n", + "['English', '', '', '', '', '', '', '', '', ''... | \n", + "1995 | \n", + "['Walter Matthau', 'Jack Lemmon', 'Ann-Margret... | \n", + "Howard Deutch | \n", + "
3 | \n", + "3 | \n", + "16000000.0 | \n", + "31357 | \n", + "en | \n", + "Cheated on, mistreated and stepped on, the wom... | \n", + "3.859495 | \n", + "1995-12-22 | \n", + "81452156.0 | \n", + "127.0 | \n", + "Released | \n", + "... | \n", + "34.0 | \n", + "5.090760 | \n", + "NaN | \n", + "['Comedy', 'Drama', 'Romance', '', '', '', '',... | \n", + "['Twentieth Century Fox Film Corporation', '',... | \n", + "['United States of America', '', '', '', '', '... | \n", + "['English', '', '', '', '', '', '', '', '', ''... | \n", + "1995 | \n", + "['Whitney Houston', 'Angela Bassett', 'Loretta... | \n", + "Forest Whitaker | \n", + "
4 | \n", + "4 | \n", + "0.0 | \n", + "11862 | \n", + "en | \n", + "Just when George Banks has recovered from his ... | \n", + "8.387519 | \n", + "1995-02-10 | \n", + "76578911.0 | \n", + "106.0 | \n", + "Released | \n", + "... | \n", + "173.0 | \n", + "0.000000 | \n", + "Father of the Bride Collection | \n", + "['Comedy', '', '', '', '', '', '', ''] | \n", + "['Sandollar Productions', 'Touchstone Pictures... | \n", + "['United States of America', '', '', '', '', '... | \n", + "['English', '', '', '', '', '', '', '', '', ''... | \n", + "1995 | \n", + "['Steve Martin', 'Diane Keaton', 'Martin Short... | \n", + "Alan Silvestri | \n", + "
5 rows × 22 columns
\n", + "\n", + " | budget | \n", + "popularity | \n", + "revenue | \n", + "vote_average | \n", + "vote_count | \n", + "return | \n", + "release_year | \n", + "
---|---|---|---|---|---|---|---|
count | \n", + "4.534500e+04 | \n", + "45345.000000 | \n", + "4.534500e+04 | \n", + "45345.000000 | \n", + "45345.000000 | \n", + "4.534500e+04 | \n", + "45345.000000 | \n", + "
mean | \n", + "4.232673e+06 | \n", + "2.926248 | \n", + "1.123390e+07 | \n", + "5.624320 | \n", + "110.137722 | \n", + "6.604937e+02 | \n", + "1991.882280 | \n", + "
std | \n", + "1.744391e+07 | \n", + "6.011023 | \n", + "6.441058e+07 | \n", + "1.915178 | \n", + "491.904428 | \n", + "7.471882e+04 | \n", + "24.053016 | \n", + "
min | \n", + "0.000000e+00 | \n", + "0.000000 | \n", + "0.000000e+00 | \n", + "0.000000 | \n", + "0.000000 | \n", + "0.000000e+00 | \n", + "1874.000000 | \n", + "
25% | \n", + "0.000000e+00 | \n", + "0.388826 | \n", + "0.000000e+00 | \n", + "5.000000 | \n", + "3.000000 | \n", + "0.000000e+00 | \n", + "1978.000000 | \n", + "
50% | \n", + "0.000000e+00 | \n", + "1.130269 | \n", + "0.000000e+00 | \n", + "6.000000 | \n", + "10.000000 | \n", + "0.000000e+00 | \n", + "2001.000000 | \n", + "
75% | \n", + "0.000000e+00 | \n", + "3.689610 | \n", + "0.000000e+00 | \n", + "6.800000 | \n", + "34.000000 | \n", + "0.000000e+00 | \n", + "2010.000000 | \n", + "
max | \n", + "3.800000e+08 | \n", + "547.488298 | \n", + "2.787965e+09 | \n", + "10.000000 | \n", + "14075.000000 | \n", + "1.239638e+07 | \n", + "2020.000000 | \n", + "
\n", + " | 0 | \n", + "1 | \n", + "2 | \n", + "3 | \n", + "4 | \n", + "5 | \n", + "6 | \n", + "7 | \n", + "8 | \n", + "
---|---|---|---|---|---|---|---|---|---|
0 | \n", + "Animation | \n", + "Comedy | \n", + "Family | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "
1 | \n", + "Adventure | \n", + "Fantasy | \n", + "Family | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "
2 | \n", + "Romance | \n", + "Comedy | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "
3 | \n", + "Comedy | \n", + "Drama | \n", + "Romance | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "
4 | \n", + "Comedy | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "None | \n", + "
\n", + " | title | \n", + "genres_name | \n", + "popularity | \n", + "vote_average | \n", + "Western | \n", + "Drama | \n", + "History | \n", + "Comedy | \n", + "Animation | \n", + "Mystery | \n", + "... | \n", + "TV | \n", + "Crime | \n", + "Documentary | \n", + "Action | \n", + "Romance | \n", + "Fantasy | \n", + "Science | \n", + "Fiction | \n", + "Adventure | \n", + "Movie | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "Toy Story | \n", + "Animation Comedy Family | \n", + "21.946943 | \n", + "7.7 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "1 | \n", + "0 | \n", + "... | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "
1 | \n", + "Jumanji | \n", + "Adventure Fantasy Family | \n", + "17.015539 | \n", + "6.9 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "... | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "
2 | \n", + "Grumpier Old Men | \n", + "Romance Comedy | \n", + "11.712900 | \n", + "6.5 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "... | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "
3 | \n", + "Waiting to Exhale | \n", + "Comedy Drama Romance | \n", + "3.859495 | \n", + "6.1 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "... | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "
4 | \n", + "Father of the Bride Part II | \n", + "Comedy | \n", + "8.387519 | \n", + "5.7 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "1 | \n", + "0 | \n", + "0 | \n", + "... | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "
5 rows × 27 columns
\n", + "