From 433d23176a57368bf4321a98ce67dc4de60e3965 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Wed, 12 Apr 2017 16:14:22 -0300
Subject: [PATCH 01/10] Add an option to search texts only within some ids

Adds the `filter_list` argument to the `get_similar` function; it holds
the list of ids of the texts in which to search.
---
 textar/text_classifier.py | 44 ++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 00da330..ebd0c0d 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -10,8 +10,8 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.linear_model import SGDClassifier
-from sklearn.svm import LinearSVC
-from scipy import sparse
+from sklearn.svm import LinearSVC # No se usa mas Quitar
+from scipy import sparse # No se usa mas quitar
 import pandas as pd
 import numpy as np
 import os
@@ -168,7 +168,7 @@ def _make_text_vectors(self, examples):
         return textvec
 
     def get_similar(self, example, max_similars=3, similarity_cutoff=None,
-                    term_diff_cutoff=0.6):
+                    term_diff_cutoff=0.6, filter_list=None):
         """Devuelve textos similares al ejemplo dentro de los textos
         entrenados.
 
         Nota:
@@ -186,6 +186,8 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                 a la hora de recuperar textos (no afecta el funcionamiento de
                 que textos se consideran cercanos, solo la cantidad de terminos
                 que se devuelven en best_words).
+            filter_list (list): Lista de ids de textos en la cual buscar textos
+                similares.
 
         Returns:
             tuple (list, list, list): (text_ids, sorted_dist, best_words)
@@ -197,13 +199,23 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                 palabras mas relevantes que se usaron para seleccionar esa
                 sugerencia.
         """
-        if max_similars > self.term_mat.shape[0]:
+
+        if filter_list:
+            if max_similars > len(filter_list):
+                raise ValueError("No se pueden pedir mas sugerencias que la \
+                    cantidad de textos en `filter_list`.")
+            else:
+                filt_idx = np.in1d(self.ids, filter_list)
+        elif max_similars > self.term_mat.shape[0]:
             raise ValueError("No se pueden pedir mas sugerencias que la \
                 cantidad de textos que hay almacenados.")
+        else:
+            filt_idx = slice(None)
+
         if example in self.ids:
             index = self.ids == example
             exmpl_vec = self.tfidf_mat[index, :]
-            distances = np.squeeze(pairwise_distances(self.tfidf_mat,
+            distances = np.squeeze(pairwise_distances(self.tfidf_mat[filt_idx],
                                                       exmpl_vec))
             # Pongo la distancia a si mismo como inf, para que no se devuelva a
             # si mismo como una opcion
             distances[index] = np.inf
@@ -211,7 +223,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
         else:
             exmpl_vec = self.vectorizer.transform([example])  # contar terminos
             exmpl_vec = self.transformer.transform(exmpl_vec)  # calcular tfidf
-            distances = np.squeeze(pairwise_distances(self.tfidf_mat,
+            distances = np.squeeze(pairwise_distances(self.tfidf_mat[filt_idx],
                                                       exmpl_vec))
         sorted_indices = np.argsort(distances)
         closest_n = sorted_indices[:max_similars]
@@ -221,19 +233,27 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             sorted_dist = sorted_dist[sorted_dist < similarity_cutoff]
         best_words = []
         exmpl_vec = exmpl_vec.toarray()
+        # Calculo palabras relevantes para cada sugerencia
         for suggested in closest_n:
             test_vec = self.tfidf_mat[suggested, :].toarray()
-            differences = np.abs(exmpl_vec - test_vec)**2 / \
-                (exmpl_vec**2 + test_vec**2)
+            differences = np.abs(exmpl_vec - test_vec)**2
+            normalization = exmpl_vec**2 + test_vec**2
+            differences[normalization > 0] = differences[normalization > 0] / \
+                normalization[normalization > 0]
+            differences[normalization == 0] = np.inf
             differences = np.squeeze(np.array(differences))
             sort_I = np.argsort(differences)
-            limit = np.flatnonzero((differences[sort_I] > term_diff_cutoff)
-                                   | (np.isnan(differences[sort_I]))
+            limit = np.flatnonzero((differences[sort_I] > term_diff_cutoff) |
+                                   (np.isnan(differences[sort_I]))
                                    )[0]
             best_words.append([k for k, v in
                                self.vectorizer.vocabulary_.iteritems()
                                if v in sort_I[:limit]])
-        text_ids = self.ids[closest_n]
+        if filter_list:
+            filt_idx_to_general_idx = np.flatnonzero(filt_idx)
+            text_ids = self.ids[filt_idx_to_general_idx[closest_n]]
+        else:
+            text_ids = self.ids[closest_n]
         return list(text_ids), list(sorted_dist), best_words
 
     def reload_texts(self, texts, ids, vocabulary=None):
@@ -307,5 +327,5 @@ def _check_id_length(self, ids):
                 ingresado textos planos en lugar de ids.")
 
     def _check_repeated_ids(self, ids):
-        if length(np.unique(ids)) != length(ids):
+        if len(np.unique(ids)) != len(ids):
            raise ValueError("Hay ids repetidos.")
From bc59174fdfb78c9414842c55aa50a696719a5aa8 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Wed, 19 Apr 2017 16:35:59 -0300
Subject: [PATCH 02/10] Fix error in how the query's infinite distance to
 itself was assigned

---
 textar/text_classifier.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index ebd0c0d..bb82c1e 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -219,7 +219,10 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                                                       exmpl_vec))
             # Pongo la distancia a si mismo como inf, para que no se devuelva a
             # si mismo como una opcion
-            distances[index] = np.inf
+            if filter_list and example in filter_list:
+                distances[filter_list.index(example)] = np.inf
+            elif not filter_list:
+                distances[index] = np.inf
         else:
             exmpl_vec = self.vectorizer.transform([example])  # contar terminos
             exmpl_vec = self.transformer.transform(exmpl_vec)  # calcular tfidf
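The indexing subtlety behind this fix, restated as a standalone NumPy sketch with toy arrays (not the library's data):

    import numpy as np

    ids = np.array(["a", "b", "c", "d"])   # all stored text ids
    filter_list = ["a", "c"]               # the caller restricts the search
    filt_idx = np.in1d(ids, filter_list)   # [True, False, True, False]

    # distances is computed against tfidf_mat[filt_idx], so it has only
    # len(filter_list) == 2 entries, one per *filtered* text.
    distances = np.array([0.4, 0.0])

    # The old `distances[index] = np.inf` used index = (ids == "c"), a
    # length-4 mask that no longer lines up with the length-2 array.
    # The patched code locates the query inside the filtered list instead
    # (assuming filter_list preserves the order the ids have in self.ids):
    example = "c"
    if example in filter_list:
        distances[filter_list.index(example)] = np.inf

    print(distances)  # [0.4 inf]: the query can no longer match itself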
all_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 loop, best of 3: 2.36 s per loop\n" + ] + } + ], + "source": [ + "%%timeit\n", + "# Tiempo de la busqueda\n", + "tc.get_similar(all_ids[1],max_similars=3, term_diff_max_rank=50)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 loop, best of 3: 17.4 s per loop\n" + ] + } + ], + "source": [ + "%%timeit\n", + "# Tiempo de creacion del clasificador\n", + "tc.make_classifier(\"topic\",all_ids, all_categories)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 loops, best of 3: 31.4 ms per loop\n" + ] + } + ], + "source": [ + "%%timeit\n", + "tc.classify(\"topic\", all_ids[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0., 0., 0., ..., 0., 0., 0.]])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row.toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['0', '1', '10', ..., '997', '998', '999'], \n", + " dtype='|S4')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [Root]", + "language": "python", + "name": "Python [Root]" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + }, + "notify_time": "5" + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/textar/text_classifier.py b/textar/text_classifier.py index bb82c1e..8a69d9d 100644 --- a/textar/text_classifier.py +++ b/textar/text_classifier.py @@ -47,7 +47,7 @@ def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'): input='content', encoding=encoding, decode_error='strict', strip_accents='ascii', lowercase=True, preprocessor=None, tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1), - analyzer='word', max_df=1.0, min_df=1, max_features=None, + analyzer='word', max_df=0.8, min_df=1, max_features=None, vocabulary=vocabulary, binary=False) self.transformer = TfidfTransformer() @@ -168,7 +168,7 @@ def _make_text_vectors(self, examples): return textvec def get_similar(self, example, max_similars=3, similarity_cutoff=None, - term_diff_cutoff=0.6, filter_list=None): + term_diff_max_rank=10, filter_list=None): """Devuelve textos similares al ejemplo dentro de los textos entrenados. Nota: @@ -181,7 +181,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None, devolver. similarity_cutoff (float, optional): Valor umbral de similaridad para definir que dos textos son similares entre si. 
From ad4336d666cfe08c0478373b76d46e46b4436304 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia
Date: Tue, 16 May 2017 17:54:20 -0300
Subject: [PATCH 04/10] Fix for Python 3

---
 textar/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/textar/__init__.py b/textar/__init__.py
index 7688f12..e669d40 100644
--- a/textar/__init__.py
+++ b/textar/__init__.py
@@ -4,4 +4,4 @@
 __email__ = 'datos@modernizacion.gob.ar'
 __version__ = '0.0.4'
 
-from text_classifier import TextClassifier
+from .text_classifier import TextClassifier

From cf08feb8e5ac42a1bac0af5e6354ca8e83621526 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Fri, 19 May 2017 11:51:46 -0300
Subject: [PATCH 05/10] Python 3 integration fix

---
 textar/text_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 8a69d9d..1c4117d 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -244,7 +244,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             best_test = np.flipud(np.argsort(test_vec))[:term_diff_max_rank]
             best_words_ids = np.intersect1d(best_example, best_test)
             best_words.append([k for k, v in
-                               self.vectorizer.vocabulary_.iteritems()
+                               self.vectorizer.vocabulary_.items()
                                if v in best_words_ids])
         if filter_list:
             filt_idx_to_general_idx = np.flatnonzero(filt_idx)

From 92a572679d6765d2bd9a73464855a31596257cf4 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia
Date: Tue, 30 May 2017 17:02:43 -0300
Subject: [PATCH 06/10] Changes for Python 3

---
 textar/text_classifier.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 8a69d9d..d867ed3 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 u"""Módulo de clasificación de textos.
 
 Este módulo contiene a los objetos que permiten entrenar un clasificador
@@ -10,8 +9,6 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.linear_model import SGDClassifier
-from sklearn.svm import LinearSVC # No se usa mas Quitar
-from scipy import sparse # No se usa mas quitar
 import pandas as pd
 import numpy as np
 import os
@@ -103,7 +100,7 @@ def retrain(self, name, ids, labels):
         except AttributeError:
             raise AttributeError("No hay ningun clasificador con ese nombre.")
         indices = np.in1d(self.ids, ids)
-        if isinstance(labels, basestring):
+        if isinstance(labels, str):
             labels = [labels]
         classifier.partial_fit(self.tfidf_mat[indices, :], labels)
 
@@ -147,7 +144,7 @@ def _make_text_vectors(self, examples):
             El tamaño de la matriz es de (N, T) donde N es la cantidad de
             ejemplos y T es la cantidad de términos en el vocabulario.
         """
-        if isinstance(examples, basestring):
+        if isinstance(examples, str):
             if examples in self.ids:
                 textvec = self.tfidf_mat[self.ids == examples, :]
             else:
@@ -244,7 +241,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             best_test = np.flipud(np.argsort(test_vec))[:term_diff_max_rank]
             best_words_ids = np.intersect1d(best_example, best_test)
             best_words.append([k for k, v in
-                               self.vectorizer.vocabulary_.iteritems()
+                               self.vectorizer.vocabulary_.items()
                                if v in best_words_ids])
         if filter_list:
             filt_idx_to_general_idx = np.flatnonzero(filt_idx)
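PATCHES 04-06 move the package to Python 3: a relative import in `__init__.py`, `str` instead of the removed `basestring`, and `items()` instead of `iteritems()`. Had dual Python 2/3 support been the goal instead, the usual pattern is a small compatibility shim; a sketch of that alternative, not what these patches actually do:

    import sys

    PY3 = sys.version_info[0] >= 3

    if PY3:
        string_types = str
    else:
        string_types = basestring  # noqa: F821, only defined on Python 2

    def iter_items(d):
        """Iterate over dict items on both interpreters."""
        return d.items() if PY3 else d.iteritems()

    # e.g. isinstance(labels, string_types) and iter_items(vocabulary_)
    # would then run unchanged on 2.7 and 3.x.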
From abca8b9a305688c59b1eccff8e18e58507fde1d0 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Tue, 13 Jun 2017 15:07:43 -0300
Subject: [PATCH 07/10] Fix bug with texts composed entirely of stopwords

While at it, fix the word suggestions so that the maximum number of
terms is truncated for texts with few words.
---
 textar/text_classifier.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 1c4117d..a9eb11d 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -206,12 +206,15 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                     cantidad de textos en `filter_list`.")
             else:
                 filt_idx = np.in1d(self.ids, filter_list)
+
         elif max_similars > self.term_mat.shape[0]:
             raise ValueError("No se pueden pedir mas sugerencias que la \
                 cantidad de textos que hay almacenados.")
         else:
-            filt_idx = slice(None)
-
+            filt_idx = np.ones(len(self.ids), dtype=bool)
+        # Saco los textos compuestos solo por stop_words
+        good_ids = np.array(np.sum(self.term_mat, 1) > 0).squeeze()
+        filt_idx = filt_idx & good_ids
         if example in self.ids:
             index = self.ids == example
             exmpl_vec = self.tfidf_mat[index, :]
@@ -222,12 +225,17 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             if filter_list and example in filter_list:
                 distances[filter_list.index(example)] = np.inf
             elif not filter_list:
-                distances[index] = np.inf
+                idx_example = np.searchsorted(self.ids, example)
+                filt_idx_example = np.searchsorted(np.flatnonzero(filt_idx),
+                                                   idx_example)
+                distances[filt_idx_example] = np.inf
         else:
             exmpl_vec = self.vectorizer.transform([example])  # contar terminos
             exmpl_vec = self.transformer.transform(exmpl_vec)  # calcular tfidf
             distances = np.squeeze(pairwise_distances(self.tfidf_mat[filt_idx],
                                                       exmpl_vec))
+            if np.sum(exmpl_vec) == 0:
+                return [], [], []
         sorted_indices = np.argsort(distances)
         closest_n = sorted_indices[:max_similars]
         sorted_dist = distances[closest_n]
@@ -236,12 +244,16 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             sorted_dist = sorted_dist[sorted_dist < similarity_cutoff]
         best_words = []
         # Calculo palabras relevantes para cada sugerencia
+        best_example = np.squeeze(exmpl_vec.toarray())
+        sorted_example_weights = np.flipud(np.argsort(best_example))
+        truncated_max_rank = min(term_diff_max_rank, np.sum(best_example > 0))
+        best_example = sorted_example_weights[:truncated_max_rank]
         for suggested in closest_n:
-            best_example = np.squeeze(exmpl_vec.toarray())
-            best_example = np.flipud(
-                np.argsort(best_example))[:term_diff_max_rank]
             test_vec = np.squeeze(self.tfidf_mat[suggested, :].toarray())
-            best_test = np.flipud(np.argsort(test_vec))[:term_diff_max_rank]
+            sorted_test_weights = np.flipud(np.argsort(test_vec))
+            truncated_max_rank = min(term_diff_max_rank,
+                                     np.sum(test_vec > 0))
+            best_test = sorted_test_weights[:truncated_max_rank]
             best_words_ids = np.intersect1d(best_example, best_test)
             best_words.append([k for k, v in
                                self.vectorizer.vocabulary_.items()
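The core of the stopword fix: a text composed entirely of stopwords vectorizes to an all-zero row, so distances against it are meaningless. A standalone sketch of the patch's two guards, with a toy count matrix standing in for the library's `term_mat`:

    import numpy as np
    from scipy import sparse

    term_mat = sparse.csr_matrix(np.array([
        [2, 0, 1],
        [0, 3, 0],
        [0, 0, 0],  # a text whose words were all stopwords
    ]))

    # Guard 1: mask out stored texts with no surviving terms
    good_ids = np.array(np.sum(term_mat, 1) > 0).squeeze()
    print(good_ids)  # [ True  True False]

    # Guard 2: a query that vectorizes to all zeros gets no suggestions
    exmpl_vec = sparse.csr_matrix((1, 3))  # empty 1x3 query vector
    if np.sum(exmpl_vec) == 0:
        print("no suggestions")  # the patched method returns [], [], []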
From 8ab3f5c671bd77095a5411ca7e83b464191d7154 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Tue, 13 Jun 2017 15:15:47 -0300
Subject: [PATCH 08/10] Add DeprecationWarning

It covers a suggestions parameter that is no longer used.
---
 textar/text_classifier.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index bba3708..30488e9 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -12,3 +12,4 @@
 import pandas as pd
 import numpy as np
 import os
+import warnings
@@ -165,7 +166,8 @@ def _make_text_vectors(self, examples):
         return textvec
 
     def get_similar(self, example, max_similars=3, similarity_cutoff=None,
-                    term_diff_max_rank=10, filter_list=None):
+                    term_diff_max_rank=10, filter_list=None,
+                    term_diff_cutoff=None):
         """Devuelve textos similares al ejemplo dentro de los textos
         entrenados.
 
         Nota:
@@ -185,6 +187,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                 que se devuelven en best_words).
             filter_list (list): Lista de ids de textos en la cual buscar textos
                 similares.
+            term_diff_cutoff (float): Deprecado. Se quitara en el futuro.
 
         Returns:
             tuple (list, list, list): (text_ids, sorted_dist, best_words)
@@ -197,6 +200,9 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                 sugerencia.
         """
+        if term_diff_cutoff:
+            warnings.warn('Deprecado. Quedo sin uso. Se quitara en el futuro.',
+                          DeprecationWarning)
         if filter_list:
             if max_similars > len(filter_list):
                 raise ValueError("No se pueden pedir mas sugerencias que la \

From 718463d52d1a933988e16c8dc8e7f21a0ee867d8 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Tue, 13 Jun 2017 15:35:10 -0300
Subject: [PATCH 09/10] Add encoding declaration and change sklearn import

---
 tests/test_text_classifier.py | 3 +--
 textar/text_classifier.py    | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_text_classifier.py b/tests/test_text_classifier.py
index 0a24663..394e6d6 100644
--- a/tests/test_text_classifier.py
+++ b/tests/test_text_classifier.py
@@ -11,10 +11,9 @@
 import os
 import codecs
 import numpy as np
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.datasets import fetch_20newsgroups
 sys.path.insert(0, os.path.abspath('..'))
-
 from textar import TextClassifier
 

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 30488e9..429cced 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 u"""Módulo de clasificación de textos.
 
 Este módulo contiene a los objetos que permiten entrenar un clasificador

From 346f1a57d235cc1b1e6a2e1ad21714b616847c8b Mon Sep 17 00:00:00 2001
From: Ignacio Heredia
Date: Tue, 13 Jun 2017 15:52:44 -0300
Subject: [PATCH 10/10] Update tests for Python 3 and change the assert on
 result equality

---
 tests/test_text_classifier.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/test_text_classifier.py b/tests/test_text_classifier.py
index 0a24663..25fc61a 100644
--- a/tests/test_text_classifier.py
+++ b/tests/test_text_classifier.py
@@ -32,7 +32,7 @@ def test_get_similar(self):
                 "El edificio más antiguo tiene muchas cuadros caros porque era de un multimillonario",
                 "El edificio más moderno tiene muchas programadoras que comen manzanas durante el almuerzo grupal"
             ],
-            ids=map(str, range(4))
+            ids=list(map(str, range(4)))
         )
 
         ids, distancias, palabras_comunes = tc.get_similar(
@@ -42,10 +42,14 @@ def test_get_similar(self):
 
         self.assertEqual(ids, ['0', '3', '2', '1'])
         self.assertEqual(
-            palabras_comunes,
+            [
+                sorted(palabras)
+                for palabras in palabras_comunes
+            ]
+            ,
             [
                 [u'edificio', u'manzanas'],
-                [u'edificio', u'muchas', u'manzanas'],
+                [u'edificio', u'manzanas', u'muchas'],
                 [u'edificio', u'muchas'],
                 [u'muchas']
             ]
         )
@@ -60,13 +64,13 @@ def test_classify(self):
                 "Para hacer una torta de naranja se necesita harina, huevos, leche, ralladura de naranja y polvo de hornear",
                 "Para hacer un lemon pie se necesita crema, ralladura de limón, huevos, leche y harina"
             ],
-            ids=map(str, range(6))
+            ids=list(map(str, range(6)))
         )
 
         # entrena un clasificador
         tc.make_classifier(
             name="recetas_classifier",
-            ids=map(str, range(6)),
+            ids=list(map(str, range(6))),
             labels=["Comida", "Comida", "Trago", "Trago", "Postre", "Postre"]
         )
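To close the series, a quick sketch verifying that the deprecated `term_diff_cutoff` argument now only emits a `DeprecationWarning` rather than affecting the results (hypothetical two-text corpus; `warnings.catch_warnings` is from the standard library):

    import warnings

    from textar import TextClassifier

    tc = TextClassifier(
        texts=["el gato come pescado", "el perro come carne"],
        ids=["0", "1"],
    )

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        tc.get_similar("el gato", max_similars=1, term_diff_cutoff=0.6)

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)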