From 0a1d71d107e738c0fbdfed8064113579cd0ec601 Mon Sep 17 00:00:00 2001
From: ariapn <ariapn@gmail.com>
Date: Tue, 20 Feb 2018 10:21:09 -0600
Subject: [PATCH] Add files via upload

---
 Pandas_Basics.ipynb | 519 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 519 insertions(+)
 create mode 100644 Pandas_Basics.ipynb
diff --git a/Pandas_Basics.ipynb b/Pandas_Basics.ipynb
new file mode 100644
index 0000000..8896532
--- /dev/null
+++ b/Pandas_Basics.ipynb
@@ -0,0 +1,519 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {
+    "scrolled": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>birthday</th>\n",
+       "      <th>name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>10-Jan-1980</td>\n",
+       "      <td>Jessen H</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2/28/85</td>\n",
+       "      <td>Vic A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>16.01.1975 00:00:00</td>\n",
+       "      <td>Linden L</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              birthday          name\n",
+       "0          10-Jan-1980    Jessen H  \n",
+       "1              2/28/85        Vic A \n",
+       "2  16.01.1975 00:00:00     Linden L "
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df = pd.DataFrame({'name':['  Jessen H  ', ' Vic A ', ' Linden L '], 'birthday':['10-Jan-1980', '2/28/85', '16.01.1975 00:00:00']})\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "  Jessen H  \n"
+     ]
+    }
+   ],
+   "source": [
+    "print (df.loc[0, 'name'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 3 entries, 0 to 2\n",
+      "Data columns (total 2 columns):\n",
+      "birthday    3 non-null object\n",
+      "name        3 non-null object\n",
+      "dtypes: object(2)\n",
+      "memory usage: 128.0+ bytes\n"
+     ]
+    }
+   ],
+   "source": [
+    "df.info()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 55,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def strip_str(s):\n",
+    "    return s.strip()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 56,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#strip whitespaces on name column\n",
+    "df['name'] = df['name'].apply(lambda x:strip_str(x))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "#df['name'] = df['name'].astype(str).apply(lambda x:x.strip())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def get_first_name(name):\n",
+    "    return name.split()[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "#create first_name column from name column\n",
+    "df['first_name'] = df['name'].apply(lambda x:get_first_name(x))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df['last_name'] = df['name'].apply(lambda x:x.split()[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>birthday</th>\n",
+       "      <th>name</th>\n",
+       "      <th>first_name</th>\n",
+       "      <th>last_name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>10-Jan-1980</td>\n",
+       "      <td>Jessen H</td>\n",
+       "      <td>Jessen</td>\n",
+       "      <td>H</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>2/28/85</td>\n",
+       "      <td>Vic A</td>\n",
+       "      <td>Vic</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>16.01.1975 00:00:00</td>\n",
+       "      <td>Linden L</td>\n",
+       "      <td>Linden</td>\n",
+       "      <td>L</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "              birthday      name first_name last_name\n",
+       "0          10-Jan-1980  Jessen H     Jessen         H\n",
+       "1              2/28/85     Vic A        Vic         A\n",
+       "2  16.01.1975 00:00:00  Linden L     Linden         L"
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 62,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Jessen H\n"
+     ]
+    }
+   ],
+   "source": [
+    "print (df.loc[0, 'name'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 63,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "datetime.strptime?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Datetime format\n",
+    "%b - abbreviated month name, Jan, Dec\n",
+    "\n",
+    "%B - full month name\n",
+    "\n",
+    "%d - day of the month (01 to 31), also works without 0 padding, 1-31\n",
+    "\n",
+    "%D - same as %m/%d/%y (C/POSIX strftime shorthand; not accepted by Python's datetime.strptime)\n",
+    "\n",
+    "%H - hour, using a 24-hour clock (00 to 23)\n",
+    "\n",
+    "%I - hour, using a 12-hour clock (01 to 12)\n",
+    "\n",
+    "%m - month (01 to 12)\n",
+    "\n",
+    "%M - minute\n",
+    "\n",
+    "%S - second\n",
+    "\n",
+    "%T - time in 24-hour notation, same as %H:%M:%S (C/POSIX strftime shorthand; not accepted by Python's datetime.strptime)\n",
+    "\n",
+    "%y - year without a century (range 00 to 99)\n",
+    "\n",
+    "%Y - year including the century"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 64,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "datetime.datetime"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from datetime import datetime\n",
+    "#10-Jan-1980\n",
+    "dt = datetime.strptime('10-Jan-1980', '%d-%b-%Y')\n",
+    "type(dt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "ValueError",
+     "evalue": "time data '2/3/85' does not match format '%d/%m/%Y'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-65-521a15e3850f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0;31m#2/28/85\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      2\u001b[0m \u001b[0;31m#should be '%d/%m/%y'\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mdt\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdatetime\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrptime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'2/3/85'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'%d/%m/%Y'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+      "\u001b[0;32m/opt/conda/lib/python3.6/_strptime.py\u001b[0m in \u001b[0;36m_strptime_datetime\u001b[0;34m(cls, data_string, format)\u001b[0m\n\u001b[1;32m    563\u001b[0m     \"\"\"Return a class cls instance based on the input string and the\n\u001b[1;32m    564\u001b[0m     format string.\"\"\"\n\u001b[0;32m--> 565\u001b[0;31m     \u001b[0mtt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfraction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_strptime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    566\u001b[0m     \u001b[0mtzname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgmtoff\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    567\u001b[0m     \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfraction\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+      "\u001b[0;32m/opt/conda/lib/python3.6/_strptime.py\u001b[0m in \u001b[0;36m_strptime\u001b[0;34m(data_string, format)\u001b[0m\n\u001b[1;32m    360\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    361\u001b[0m         raise ValueError(\"time data %r does not match format %r\" %\n\u001b[0;32m--> 362\u001b[0;31m                          (data_string, format))\n\u001b[0m\u001b[1;32m    363\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    364\u001b[0m         raise ValueError(\"unconverted data remains: %s\" %\n",
+      "\u001b[0;31mValueError\u001b[0m: time data '2/3/85' does not match format '%d/%m/%Y'"
+     ]
+    }
+   ],
+   "source": [
+    "#2/28/85\n",
+    "#should be '%d/%m/%y'\n",
+    "dt = datetime.strptime('2/3/85', '%d/%m/%Y')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from datetime import datetime\n",
+    "def convert_date(date_str):\n",
+    "    '''\n",
+    "    date format: https://www.tutorialspoint.com/python/time_strptime.htm\n",
+    "    existing formats:\n",
+    "    28-Jun-1989\n",
+    "    18.07.2012 00:00:00\n",
+    "    3/25/2013\n",
+    "    3/4/2014 0:00\n",
+    "    '''\n",
+    "    try:\n",
+    "        #28-Jun-1989\n",
+    "        dt = datetime.strptime(date_str, '%d-%b-%Y')\n",
+    "        return dt#.strftime('%Y-%m-%d')\n",
+    "    except:\n",
+    "        pass\n",
+    "    try:\n",
+    "        #3/25/18\n",
+    "        dt = datetime.strptime(date_str, '%m/%d/%y')\n",
+    "        return dt#.strftime('%Y-%m-%d')\n",
+    "    except:\n",
+    "        pass\n",
+    "    try:\n",
+    "        #28-Jun-89\n",
+    "        dt = datetime.strptime(date_str, '%d-%b-%y')\n",
+    "        return dt#.strftime('%Y-%m-%d')\n",
+    "    except:\n",
+    "        pass\n",
+    "    try:\n",
+    "        #18.07.2012 00:00:00\n",
+    "        dt = datetime.strptime(date_str, '%d.%m.%Y %H:%M:%S')\n",
+    "        return dt#.strftime('%Y-%m-%d')\n",
+    "    except:\n",
+    "        pass\n",
+    "    try:\n",
+    "        #2/18/1980\n",
+    "        dt = datetime.strptime(date_str, '%m/%d/%Y')\n",
+    "        return dt#.strftime('%Y-%m-%d')\n",
+    "    except:\n",
+    "        pass\n",
+    "    try:\n",
+    "        #2/18/1980 00:00\n",
+    "        dt = datetime.strptime(date_str, '%m/%d/%Y %H:%M')\n",
+    "        return dt#.strftime('%Y-%m-%d')\n",
+    "    except:\n",
+    "        pass\n",
+    "    print (date_str)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "df['birthday'] = df['birthday'].apply(lambda x:convert_date(x))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>birthday</th>\n",
+       "      <th>name</th>\n",
+       "      <th>first_name</th>\n",
+       "      <th>last_name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>1980-01-10</td>\n",
+       "      <td>Jessen H</td>\n",
+       "      <td>Jessen</td>\n",
+       "      <td>H</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1985-02-28</td>\n",
+       "      <td>Vic A</td>\n",
+       "      <td>Vic</td>\n",
+       "      <td>A</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1975-01-16</td>\n",
+       "      <td>Linden L</td>\n",
+       "      <td>Linden</td>\n",
+       "      <td>L</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    birthday      name first_name last_name\n",
+       "0 1980-01-10  Jessen H     Jessen         H\n",
+       "1 1985-02-28     Vic A        Vic         A\n",
+       "2 1975-01-16  Linden L     Linden         L"
+      ]
+     },
+     "execution_count": 68,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "pandas.tslib.Timestamp"
+      ]
+     },
+     "execution_count": 69,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "type(df.loc[0,'birthday'])"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

	birthday	name
0	10-Jan-1980	Jessen H
1	2/28/85	Vic A
2	16.01.1975 00:00:00	Linden L
	birthday	name	first_name	last_name
0	10-Jan-1980	Jessen H	Jessen	H
1	2/28/85	Vic A	Vic	A
2	16.01.1975 00:00:00	Linden L	Linden	L
	birthday	name	first_name	last_name
0	1980-01-10	Jessen H	Jessen	H
1	1985-02-28	Vic A	Vic	A
2	1975-01-16	Linden L	Linden	L