Merge pull request #3 from datosgobar/dev-nacho

Dev nacho
datosgobar · Jun 13, 2017 · 3bfe582 · 3bfe582
2 parents f9725b1 + c5180c2
commit 3bfe582
Show file tree

Hide file tree

Showing 5 changed files with 301 additions and 32 deletions.
diff --git a/.gitignore b/.gitignore
@@ -36,6 +36,9 @@ var/
 pip-log.txt
 pip-delete-this-directory.txt
 
+# Performance test data
+tests/data/performance_data/
+
 # Unit test / coverage reports
 htmlcov/
 .tox/

diff --git a/tests/Pruebas de Performance Textar.ipynb b/tests/Pruebas de Performance Textar.ipynb
@@ -0,0 +1,230 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext memory_profiler\n",
+    "from textar import TextClassifier\n",
+    "import xml.etree.ElementTree as ET\n",
+    "from lxml import etree\n",
+    "import numpy as np\n",
+    "import re\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Helper funcs\n",
+    "\n",
+    "def parse_blog(tree, min_words=100):\n",
+    "    \"\"\"Extract the (date, post) pairs of one blog that look like real text.\n",
+    "\n",
+    "    Args:\n",
+    "        tree: iterable of elements exposing ``tag`` and ``text``; only the\n",
+    "            ``date`` and ``post`` tags are read, any other tag is ignored.\n",
+    "        min_words: a post is kept only if it has more than this many words.\n",
+    "\n",
+    "    Returns:\n",
+    "        Two parallel lists ``(dates, posts)``. Each kept post is paired with\n",
+    "        the text of the last ``date`` element seen before it, or ``None``\n",
+    "        when no date preceded it.\n",
+    "    \"\"\"\n",
+    "    dates = []\n",
+    "    posts = []\n",
+    "    # Initialised up front so a post that precedes every date does not raise\n",
+    "    # UnboundLocalError.\n",
+    "    date = None\n",
+    "    for elem in tree:\n",
+    "        if elem.tag == 'date':\n",
+    "            date = elem.text\n",
+    "            continue\n",
+    "        if elem.tag != 'post' or elem.text is None:\n",
+    "            continue\n",
+    "        post = elem.text\n",
+    "        # A word here is a run of word characters plus the one non-word\n",
+    "        # character that follows it.\n",
+    "        words = re.findall(r'\\w+\\W', post)\n",
+    "        # Build a real list: np.mean cannot average the lazy map object that\n",
+    "        # Python 3 returns. Posts whose matches average 2 characters or fewer\n",
+    "        # are discarded.\n",
+    "        if len(words) > min_words and np.mean([len(w) for w in words]) > 2:\n",
+    "            dates.append(date)\n",
+    "            posts.append(post)\n",
+    "    return dates, posts"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Configs\n",
+    "DATA_FOLDER = os.path.join('.','data','performance_data','blogs')\n",
+    "MAX_FILES = 10000"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# DOCTYPE prefix prepended to every file before parsing; it declares the\n",
+    "# nbsp entity (presumably because the blog files use it - confirm).\n",
+    "magic = '''<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n",
+    "            \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\" [\n",
+    "            <!ENTITY nbsp ' '>\n",
+    "            ]>'''\n",
+    "\n",
+    "parser = etree.XMLParser(recover=True)\n",
+    "\n",
+    "# Parallel lists: position i of each list describes the same post.\n",
+    "all_dates = []\n",
+    "all_posts = []\n",
+    "all_genders = []\n",
+    "all_ages = []\n",
+    "all_categories = []\n",
+    "# Files that were left out, reported at the end instead of being hidden.\n",
+    "skipped_files = 0\n",
+    "\n",
+    "for file_name in os.listdir(DATA_FOLDER)[:MAX_FILES]:\n",
+    "    fields = file_name.split('.')\n",
+    "    if len(fields) != 6:\n",
+    "        # Not named <id>.<gender>.<age>.<category>.<zodiac>.<ext>; unpacking\n",
+    "        # it would raise ValueError and abort the whole load.\n",
+    "        skipped_files += 1\n",
+    "        continue\n",
+    "    id_f, gender, age, category, zodiac, ext = fields\n",
+    "    with open(os.path.join(DATA_FOLDER, file_name), 'r') as f:\n",
+    "        try:\n",
+    "            tree = ET.fromstring(magic + f.read(), parser=parser)\n",
+    "            dates, posts = parse_blog(tree)\n",
+    "        except Exception:\n",
+    "            # Best effort: a file that cannot be read or parsed is skipped so\n",
+    "            # the rest of the corpus still loads, but it is counted.\n",
+    "            skipped_files += 1\n",
+    "            continue\n",
+    "    all_posts += posts\n",
+    "    all_dates += dates\n",
+    "    all_genders += [gender] * len(dates)\n",
+    "    all_ages += [age] * len(dates)\n",
+    "    all_categories += [category] * len(dates)\n",
+    "\n",
+    "# A real list, because later cells index it (all_ids[1]) and Python 3's map\n",
+    "# returns an iterator that cannot be indexed.\n",
+    "all_ids = list(map(str, range(len(all_posts))))\n",
+    "print(\"Skipped files: {:d}\".format(skipped_files))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%timeit\n",
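+    "# Time the creation of the TextClassifier object.\n",
+    "# NOTE(review): a name assigned inside a %%timeit cell is presumably not kept\n",
+    "# in the notebook namespace, yet later cells use `tc` - confirm where it is set.\n",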
+    "tc = TextClassifier(all_posts, all_ids)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 loop, best of 3: 2.36 s per loop\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit\n",
+    "# Time of the similarity search\n",
+    "tc.get_similar(all_ids[1],max_similars=3, term_diff_max_rank=50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "1 loop, best of 3: 17.4 s per loop\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit\n",
+    "# Time to create the classifier\n",
+    "tc.make_classifier(\"topic\",all_ids, all_categories)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 50,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "10 loops, best of 3: 31.4 ms per loop\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit\n",
+    "tc.classify(\"topic\", all_ids[1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([[ 0.,  0.,  0., ...,  0.,  0.,  0.]])"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "row.toarray()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array(['0', '1', '10', ..., '997', '998', '999'], \n",
+       "      dtype='|S4')"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python [Root]",
+   "language": "python",
+   "name": "Python [Root]"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.12"
+  },
+  "notify_time": "5"
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/tests/test_text_classifier.py b/tests/test_text_classifier.py
@@ -11,10 +11,9 @@
 import os
 import codecs
 import numpy as np
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.datasets import fetch_20newsgroups
 sys.path.insert(0, os.path.abspath('..'))
-
 from textar import TextClassifier
 
 
@@ -32,7 +31,7 @@ def test_get_similar(self):
                 "El edificio más antiguo tiene muchas cuadros caros porque era de un multimillonario",
                 "El edificio más moderno tiene muchas programadoras que comen manzanas durante el almuerzo grupal"
             ],
-            ids=map(str, range(4))
+            ids=list(map(str, range(4)))
         )
 
         ids, distancias, palabras_comunes = tc.get_similar(
@@ -42,10 +41,14 @@ def test_get_similar(self):
 
         self.assertEqual(ids, ['0', '3', '2', '1'])
         self.assertEqual(
-            palabras_comunes,
+            [
+                sorted(palabras)
+                for palabras in palabras_comunes
+            ]
+            ,
             [
                 [u'edificio', u'manzanas'],
-                [u'edificio', u'muchas', u'manzanas'],
+                [u'edificio', u'manzanas', u'muchas'],
                 [u'edificio', u'muchas'], [u'muchas']
             ]
         )
@@ -60,13 +63,13 @@ def test_classify(self):
                 "Para hacer una torta de naranja se necesita harina, huevos, leche, ralladura de naranja y polvo de hornear",
                 "Para hacer un lemon pie se necesita crema, ralladura de limón, huevos, leche y harina"
             ],
-            ids=map(str, range(6))
+            ids=list(map(str, range(6)))
         )
 
         # entrena un clasificador
         tc.make_classifier(
             name="recetas_classifier",
-            ids=map(str, range(6)),
+            ids=list(map(str, range(6))),
             labels=["Comida", "Comida", "Trago", "Trago", "Postre", "Postre"]
         )
 

diff --git a/textar/__init__.py b/textar/__init__.py
@@ -4,4 +4,4 @@
 __email__ = '[email protected]'
 __version__ = '0.0.4'
 
-from text_classifier import TextClassifier
+from .text_classifier import TextClassifier