From 0a1d71d107e738c0fbdfed8064113579cd0ec601 Mon Sep 17 00:00:00 2001 From: ariapn Date: Tue, 20 Feb 2018 10:21:09 -0600 Subject: [PATCH] Add files via upload --- Pandas_Basics.ipynb | 519 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 519 insertions(+) create mode 100644 Pandas_Basics.ipynb diff --git a/Pandas_Basics.ipynb b/Pandas_Basics.ipynb new file mode 100644 index 0000000..8896532 --- /dev/null +++ b/Pandas_Basics.ipynb @@ -0,0 +1,519 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
birthdayname
010-Jan-1980Jessen H
12/28/85Vic A
216.01.1975 00:00:00Linden L
\n", + "
" + ], + "text/plain": [ + " birthday name\n", + "0 10-Jan-1980 Jessen H \n", + "1 2/28/85 Vic A \n", + "2 16.01.1975 00:00:00 Linden L " + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.DataFrame({'name':[' Jessen H ', ' Vic A ', ' Linden L '], 'birthday':['10-Jan-1980', '2/28/85', '16.01.1975 00:00:00']})\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Jessen H \n" + ] + } + ], + "source": [ + "print (df.loc[0, 'name'])" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 3 entries, 0 to 2\n", + "Data columns (total 2 columns):\n", + "birthday 3 non-null object\n", + "name 3 non-null object\n", + "dtypes: object(2)\n", + "memory usage: 128.0+ bytes\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def strip_str(s):\n", + " return s.strip()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": {}, + "outputs": [], + "source": [ + "#strip whitespaces on name column\n", + "df['name'] = df['name'].apply(lambda x:strip_str(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#df['name'] = df['name'].astype(str).apply(lambda x:x.strip())" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def get_first_name(name):\n", + " return name.split()[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#create first_name column from name column\n", + 
"df['first_name'] = df['name'].apply(lambda x:get_first_name(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "df['last_name'] = df['name'].apply(lambda x:x.split()[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
birthdaynamefirst_namelast_name
010-Jan-1980Jessen HJessenH
12/28/85Vic AVicA
216.01.1975 00:00:00Linden LLindenL
\n", + "
" + ], + "text/plain": [ + " birthday name first_name last_name\n", + "0 10-Jan-1980 Jessen H Jessen H\n", + "1 2/28/85 Vic A Vic A\n", + "2 16.01.1975 00:00:00 Linden L Linden L" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Jessen H\n" + ] + } + ], + "source": [ + "print (df.loc[0, 'name'])" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "datetime.strptime?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Datetime format\n", + "%b - abbreviated month name (e.g. Jan, Dec)\n", + "\n", + "%B - full month name\n", + "\n", + "%d - day of the month (01 to 31); when parsing, zero padding is optional (1 to 31)\n", + "\n", + "%D - same as %m/%d/%y (platform-dependent in strftime; not accepted by datetime.strptime)\n", + "\n", + "%H - hour, using a 24-hour clock (00 to 23)\n", + "\n", + "%I - hour, using a 12-hour clock (01 to 12)\n", + "\n", + "%m - month (01 to 12)\n", + "\n", + "%M - minute\n", + "\n", + "%S - second\n", + "\n", + "%T - time in 24-hour notation, same as %H:%M:%S (platform-dependent in strftime; not accepted by datetime.strptime)\n", + "\n", + "%y - year without a century (range 00 to 99)\n", + "\n", + "%Y - year including the century" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "datetime.datetime" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from datetime import datetime\n", + "#10-Jan-1980\n", + "dt = datetime.strptime('10-Jan-1980', '%d-%b-%Y')\n", + "type(dt)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [ + { + "ename": "ValueError", + "evalue": "time data '2/3/85' does not match format '%d/%m/%Y'", + "output_type": "error", + "traceback": 
[ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0;31m#2/28/85\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;31m#should be '%d/%m/%y'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mdt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrptime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'2/3/85'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'%d/%m/%Y'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", + "\u001b[0;32m/opt/conda/lib/python3.6/_strptime.py\u001b[0m in \u001b[0;36m_strptime_datetime\u001b[0;34m(cls, data_string, format)\u001b[0m\n\u001b[1;32m 563\u001b[0m \"\"\"Return a class cls instance based on the input string and the\n\u001b[1;32m 564\u001b[0m format string.\"\"\"\n\u001b[0;32m--> 565\u001b[0;31m \u001b[0mtt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfraction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_strptime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 566\u001b[0m \u001b[0mtzname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgmtoff\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 567\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfraction\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", 
+ "\u001b[0;32m/opt/conda/lib/python3.6/_strptime.py\u001b[0m in \u001b[0;36m_strptime\u001b[0;34m(data_string, format)\u001b[0m\n\u001b[1;32m 360\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 361\u001b[0m raise ValueError(\"time data %r does not match format %r\" %\n\u001b[0;32m--> 362\u001b[0;31m (data_string, format))\n\u001b[0m\u001b[1;32m 363\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 364\u001b[0m raise ValueError(\"unconverted data remains: %s\" %\n", + "\u001b[0;31mValueError\u001b[0m: time data '2/3/85' does not match format '%d/%m/%Y'" + ] + } + ], + "source": [ + "#2/28/85\n", + "#should be '%d/%m/%y'\n", + "dt = datetime.strptime('2/3/85', '%d/%m/%Y')" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from datetime import datetime\n", + "def convert_date(date_str):\n", + " '''\n", + " date format: https://www.tutorialspoint.com/python/time_strptime.htm\n", + " existing formats:\n", + " 28-Jun-1989\n", + " 18.07.2012 00:00:00\n", + " 3/25/2013\n", + " 3/4/2014 0:00\n", + " '''\n", + " try:\n", + " #28-Jun-1989\n", + " dt = datetime.strptime(date_str, '%d-%b-%Y')\n", + " return dt#.strftime('%Y-%m-%d')\n", + " except:\n", + " pass\n", + " try:\n", + " #3/25/18\n", + " dt = datetime.strptime(date_str, '%m/%d/%y')\n", + " return dt#.strftime('%Y-%m-%d')\n", + " except:\n", + " pass\n", + " try:\n", + " #28-Jun-89\n", + " dt = datetime.strptime(date_str, '%d-%b-%y')\n", + " return dt#.strftime('%Y-%m-%d')\n", + " except:\n", + " pass\n", + " try:\n", + " #18.07.2012 00:00:00\n", + " dt = 
datetime.strptime(date_str, '%d.%m.%Y %H:%M:%S')\n", + " return dt#.strftime('%Y-%m-%d')\n", + " except:\n", + " pass\n", + " try:\n", + " #2/18/1980\n", + " dt = datetime.strptime(date_str, '%m/%d/%Y')\n", + " return dt#.strftime('%Y-%m-%d')\n", + " except:\n", + " pass\n", + " try:\n", + " #2/18/1980 00:00\n", + " dt = datetime.strptime(date_str, '%m/%d/%Y %H:%M')\n", + " return dt#.strftime('%Y-%m-%d')\n", + " except:\n", + " pass\n", + " print (date_str)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "df['birthday'] = df['birthday'].apply(lambda x:convert_date(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
birthdaynamefirst_namelast_name
01980-01-10Jessen HJessenH
11985-02-28Vic AVicA
21975-01-16Linden LLindenL
\n", + "
" + ], + "text/plain": [ + " birthday name first_name last_name\n", + "0 1980-01-10 Jessen H Jessen H\n", + "1 1985-02-28 Vic A Vic A\n", + "2 1975-01-16 Linden L Linden L" + ] + }, + "execution_count": 68, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "pandas.tslib.Timestamp" + ] + }, + "execution_count": 69, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "type(df.loc[0,'birthday'])" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.1" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}