From 433d23176a57368bf4321a98ce67dc4de60e3965 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Wed, 12 Apr 2017 16:14:22 -0300
Subject: [PATCH 01/10] Add an option to search texts only within some ids

Adds the `filter_list` argument to the `get_similar` function; it holds
the list of ids of the texts in which to search.
---
 textar/text_classifier.py | 44 ++++++++++++++++++++++++++++-----------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 00da330..ebd0c0d 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -10,8 +10,8 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.linear_model import SGDClassifier
-from sklearn.svm import LinearSVC
-from scipy import sparse
+from sklearn.svm import LinearSVC # No se usa mas Quitar
+from scipy import sparse # No se usa mas quitar
 import pandas as pd
 import numpy as np
 import os
@@ -168,7 +168,7 @@ def _make_text_vectors(self, examples):
         return textvec
 
     def get_similar(self, example, max_similars=3, similarity_cutoff=None,
-                    term_diff_cutoff=0.6):
+                    term_diff_cutoff=0.6, filter_list=None):
         """Devuelve textos similares al ejemplo dentro de los textos
         entrenados.
 
         Nota:
@@ -186,6 +186,8 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                 a la hora de recuperar textos (no afecta el funcionamiento de
                 que textos se consideran cercanos, solo la cantidad de terminos
                 que se devuelven en best_words).
+            filter_list (list): Lista de ids de textos en la cual buscar textos
+                similares.
 
         Returns:
             tuple (list, list, list): (text_ids, sorted_dist, best_words)
@@ -197,13 +199,23 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                 palabras mas relevantes que se usaron para seleccionar esa
                 sugerencia.
         """
-        if max_similars > self.term_mat.shape[0]:
+
+        if filter_list:
+            if max_similars > len(filter_list):
+                raise ValueError("No se pueden pedir mas sugerencias que la \
+                    cantidad de textos en `filter_list`.")
+            else:
+                filt_idx = np.in1d(self.ids, filter_list)
+        elif max_similars > self.term_mat.shape[0]:
             raise ValueError("No se pueden pedir mas sugerencias que la \
                 cantidad de textos que hay almacenados.")
+        else:
+            filt_idx = slice(None)
+
         if example in self.ids:
             index = self.ids == example
             exmpl_vec = self.tfidf_mat[index, :]
-            distances = np.squeeze(pairwise_distances(self.tfidf_mat,
+            distances = np.squeeze(pairwise_distances(self.tfidf_mat[filt_idx],
                                                       exmpl_vec))
             # Pongo la distancia a si mismo como inf, para que no se devuelva a
             # si mismo como una opcion
             distances[index] = np.inf
@@ -211,7 +223,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
         else:
             exmpl_vec = self.vectorizer.transform([example])  # contar terminos
             exmpl_vec = self.transformer.transform(exmpl_vec)  # calcular tfidf
-            distances = np.squeeze(pairwise_distances(self.tfidf_mat,
+            distances = np.squeeze(pairwise_distances(self.tfidf_mat[filt_idx],
                                                       exmpl_vec))
         sorted_indices = np.argsort(distances)
         closest_n = sorted_indices[:max_similars]
@@ -221,19 +233,27 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             sorted_dist = sorted_dist[sorted_dist < similarity_cutoff]
         best_words = []
         exmpl_vec = exmpl_vec.toarray()
+        # Calculo palabras relevantes para cada sugerencia
         for suggested in closest_n:
             test_vec = self.tfidf_mat[suggested, :].toarray()
-            differences = np.abs(exmpl_vec - test_vec)**2 / \
-                (exmpl_vec**2 + test_vec**2)
+            differences = np.abs(exmpl_vec - test_vec)**2
+            normalization = exmpl_vec**2 + test_vec**2
+            differences[normalization > 0] = differences[normalization > 0] / \
+                normalization[normalization > 0]
+            differences[normalization == 0] = np.inf
             differences = np.squeeze(np.array(differences))
             sort_I = np.argsort(differences)
-            limit = np.flatnonzero((differences[sort_I] > term_diff_cutoff)
-                                   | (np.isnan(differences[sort_I]))
+            limit = np.flatnonzero((differences[sort_I] > term_diff_cutoff) |
+                                   (np.isnan(differences[sort_I]))
                                    )[0]
             best_words.append([k for k, v in
                                self.vectorizer.vocabulary_.iteritems()
                                if v in sort_I[:limit]])
-        text_ids = self.ids[closest_n]
+        if filter_list:
+            filt_idx_to_general_idx = np.flatnonzero(filt_idx)
+            text_ids = self.ids[filt_idx_to_general_idx[closest_n]]
+        else:
+            text_ids = self.ids[closest_n]
         return list(text_ids), list(sorted_dist), best_words
 
     def reload_texts(self, texts, ids, vocabulary=None):
@@ -307,5 +327,5 @@ def _check_id_length(self, ids):
                 ingresado textos planos en lugar de ids.")
 
     def _check_repeated_ids(self, ids):
-        if length(np.unique(ids)) != length(ids):
+        if len(np.unique(ids)) != len(ids):
            raise ValueError("Hay ids repetidos.")
From bc59174fdfb78c9414842c55aa50a696719a5aa8 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Wed, 19 Apr 2017 16:35:59 -0300
Subject: [PATCH 02/10] Fix error in how the query's infinite distance to
 itself was assigned

---
 textar/text_classifier.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index ebd0c0d..bb82c1e 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -219,7 +219,10 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                                                       exmpl_vec))
             # Pongo la distancia a si mismo como inf, para que no se devuelva a
             # si mismo como una opcion
-            distances[index] = np.inf
+            if filter_list and example in filter_list:
+                distances[filter_list.index(example)] = np.inf
+            elif not filter_list:
+                distances[index] = np.inf
         else:
             exmpl_vec = self.vectorizer.transform([example])  # contar terminos
             exmpl_vec = self.transformer.transform(exmpl_vec)  # calcular tfidf
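The indexing subtlety behind this fix, restated as a standalone NumPy sketch with toy arrays (not the library's data):

    import numpy as np

    ids = np.array(["a", "b", "c", "d"])   # all stored text ids
    filter_list = ["a", "c"]               # the caller restricts the search
    filt_idx = np.in1d(ids, filter_list)   # [True, False, True, False]

    # distances is computed against tfidf_mat[filt_idx], so it has only
    # len(filter_list) == 2 entries, one per *filtered* text.
    distances = np.array([0.4, 0.0])

    # The old `distances[index] = np.inf` used index = (ids == "c"), a
    # length-4 mask that no longer lines up with the length-2 array.
    # The patched code locates the query inside the filtered list instead
    # (assuming filter_list preserves the order the ids have in self.ids):
    example = "c"
    if example in filter_list:
        distances[filter_list.index(example)] = np.inf

    print(distances)  # [0.4 inf]: the query can no longer match itself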
all_ids)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 loop, best of 3: 2.36 s per loop\n" + ] + } + ], + "source": [ + "%%timeit\n", + "# Tiempo de la busqueda\n", + "tc.get_similar(all_ids[1],max_similars=3, term_diff_max_rank=50)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1 loop, best of 3: 17.4 s per loop\n" + ] + } + ], + "source": [ + "%%timeit\n", + "# Tiempo de creacion del clasificador\n", + "tc.make_classifier(\"topic\",all_ids, all_categories)" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10 loops, best of 3: 31.4 ms per loop\n" + ] + } + ], + "source": [ + "%%timeit\n", + "tc.classify(\"topic\", all_ids[1])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0., 0., 0., ..., 0., 0., 0.]])" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "row.toarray()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['0', '1', '10', ..., '997', '998', '999'], \n", + " dtype='|S4')" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [Root]", + "language": "python", + "name": "Python [Root]" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + }, + "notify_time": "5" + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/textar/text_classifier.py b/textar/text_classifier.py index bb82c1e..8a69d9d 100644 --- a/textar/text_classifier.py +++ b/textar/text_classifier.py @@ -47,7 +47,7 @@ def __init__(self, texts, ids, vocabulary=None, encoding='utf-8'): input='content', encoding=encoding, decode_error='strict', strip_accents='ascii', lowercase=True, preprocessor=None, tokenizer=None, stop_words=es_stopwords, ngram_range=(1, 1), - analyzer='word', max_df=1.0, min_df=1, max_features=None, + analyzer='word', max_df=0.8, min_df=1, max_features=None, vocabulary=vocabulary, binary=False) self.transformer = TfidfTransformer() @@ -168,7 +168,7 @@ def _make_text_vectors(self, examples): return textvec def get_similar(self, example, max_similars=3, similarity_cutoff=None, - term_diff_cutoff=0.6, filter_list=None): + term_diff_max_rank=10, filter_list=None): """Devuelve textos similares al ejemplo dentro de los textos entrenados. Nota: @@ -181,7 +181,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None, devolver. similarity_cutoff (float, optional): Valor umbral de similaridad para definir que dos textos son similares entre si. 
From ad4336d666cfe08c0478373b76d46e46b4436304 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia
Date: Tue, 16 May 2017 17:54:20 -0300
Subject: [PATCH 04/10] Fix for Python 3

---
 textar/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/textar/__init__.py b/textar/__init__.py
index 7688f12..e669d40 100644
--- a/textar/__init__.py
+++ b/textar/__init__.py
@@ -4,4 +4,4 @@
 __email__ = 'datos@modernizacion.gob.ar'
 __version__ = '0.0.4'
 
-from text_classifier import TextClassifier
+from .text_classifier import TextClassifier

From cf08feb8e5ac42a1bac0af5e6354ca8e83621526 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Fri, 19 May 2017 11:51:46 -0300
Subject: [PATCH 05/10] Python 3 integration fix

---
 textar/text_classifier.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 8a69d9d..1c4117d 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -244,7 +244,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             best_test = np.flipud(np.argsort(test_vec))[:term_diff_max_rank]
             best_words_ids = np.intersect1d(best_example, best_test)
             best_words.append([k for k, v in
-                               self.vectorizer.vocabulary_.iteritems()
+                               self.vectorizer.vocabulary_.items()
                                if v in best_words_ids])
         if filter_list:
             filt_idx_to_general_idx = np.flatnonzero(filt_idx)

From 92a572679d6765d2bd9a73464855a31596257cf4 Mon Sep 17 00:00:00 2001
From: Ignacio Heredia
Date: Tue, 30 May 2017 17:02:43 -0300
Subject: [PATCH 06/10] Changes for Python 3

---
 textar/text_classifier.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 8a69d9d..d867ed3 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -1,4 +1,3 @@
-# -*- coding: utf-8 -*-
 u"""Módulo de clasificación de textos.
 
 Este módulo contiene a los objetos que permiten entrenar un clasificador
@@ -10,8 +9,6 @@
 from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
 from sklearn.metrics.pairwise import pairwise_distances
 from sklearn.linear_model import SGDClassifier
-from sklearn.svm import LinearSVC # No se usa mas Quitar
-from scipy import sparse # No se usa mas quitar
 import pandas as pd
 import numpy as np
 import os
@@ -103,7 +100,7 @@ def retrain(self, name, ids, labels):
         except AttributeError:
             raise AttributeError("No hay ningun clasificador con ese nombre.")
         indices = np.in1d(self.ids, ids)
-        if isinstance(labels, basestring):
+        if isinstance(labels, str):
             labels = [labels]
         classifier.partial_fit(self.tfidf_mat[indices, :], labels)
 
@@ -147,7 +144,7 @@ def _make_text_vectors(self, examples):
             El tamaño de la matriz es de (N, T) donde N es la cantidad de
             ejemplos y T es la cantidad de términos en el vocabulario.
         """
-        if isinstance(examples, basestring):
+        if isinstance(examples, str):
             if examples in self.ids:
                 textvec = self.tfidf_mat[self.ids == examples, :]
             else:
@@ -244,7 +241,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             best_test = np.flipud(np.argsort(test_vec))[:term_diff_max_rank]
             best_words_ids = np.intersect1d(best_example, best_test)
             best_words.append([k for k, v in
-                               self.vectorizer.vocabulary_.iteritems()
+                               self.vectorizer.vocabulary_.items()
                                if v in best_words_ids])
         if filter_list:
             filt_idx_to_general_idx = np.flatnonzero(filt_idx)
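PATCHES 04-06 move the package to Python 3: a relative import in `__init__.py`, `str` instead of the removed `basestring`, and `items()` instead of `iteritems()`. Had dual Python 2/3 support been the goal instead, the usual pattern is a small compatibility shim; a sketch of that alternative, not what these patches actually do:

    import sys

    PY3 = sys.version_info[0] >= 3

    if PY3:
        string_types = str
    else:
        string_types = basestring  # noqa: F821, only defined on Python 2

    def iter_items(d):
        """Iterate over dict items on both interpreters."""
        return d.items() if PY3 else d.iteritems()

    # e.g. isinstance(labels, string_types) and iter_items(vocabulary_)
    # would then run unchanged on 2.7 and 3.x.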
From abca8b9a305688c59b1eccff8e18e58507fde1d0 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Tue, 13 Jun 2017 15:07:43 -0300
Subject: [PATCH 07/10] Fix bug with texts composed entirely of stopwords

While at it, fix the word suggestions so that the maximum number of
terms is truncated for texts with few words.
---
 textar/text_classifier.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 1c4117d..a9eb11d 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -206,12 +206,15 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                     cantidad de textos en `filter_list`.")
             else:
                 filt_idx = np.in1d(self.ids, filter_list)
+
         elif max_similars > self.term_mat.shape[0]:
             raise ValueError("No se pueden pedir mas sugerencias que la \
                 cantidad de textos que hay almacenados.")
         else:
-            filt_idx = slice(None)
-
+            filt_idx = np.ones(len(self.ids), dtype=bool)
+        # Saco los textos compuestos solo por stop_words
+        good_ids = np.array(np.sum(self.term_mat, 1) > 0).squeeze()
+        filt_idx = filt_idx & good_ids
         if example in self.ids:
             index = self.ids == example
             exmpl_vec = self.tfidf_mat[index, :]
@@ -222,12 +225,17 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             if filter_list and example in filter_list:
                 distances[filter_list.index(example)] = np.inf
             elif not filter_list:
-                distances[index] = np.inf
+                idx_example = np.searchsorted(self.ids, example)
+                filt_idx_example = np.searchsorted(np.flatnonzero(filt_idx),
+                                                   idx_example)
+                distances[filt_idx_example] = np.inf
         else:
             exmpl_vec = self.vectorizer.transform([example])  # contar terminos
             exmpl_vec = self.transformer.transform(exmpl_vec)  # calcular tfidf
             distances = np.squeeze(pairwise_distances(self.tfidf_mat[filt_idx],
                                                       exmpl_vec))
+            if np.sum(exmpl_vec) == 0:
+                return [], [], []
         sorted_indices = np.argsort(distances)
         closest_n = sorted_indices[:max_similars]
         sorted_dist = distances[closest_n]
@@ -236,12 +244,16 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
             sorted_dist = sorted_dist[sorted_dist < similarity_cutoff]
         best_words = []
         # Calculo palabras relevantes para cada sugerencia
+        best_example = np.squeeze(exmpl_vec.toarray())
+        sorted_example_weights = np.flipud(np.argsort(best_example))
+        truncated_max_rank = min(term_diff_max_rank, np.sum(best_example > 0))
+        best_example = sorted_example_weights[:truncated_max_rank]
         for suggested in closest_n:
-            best_example = np.squeeze(exmpl_vec.toarray())
-            best_example = np.flipud(
-                np.argsort(best_example))[:term_diff_max_rank]
             test_vec = np.squeeze(self.tfidf_mat[suggested, :].toarray())
-            best_test = np.flipud(np.argsort(test_vec))[:term_diff_max_rank]
+            sorted_test_weights = np.flipud(np.argsort(test_vec))
+            truncated_max_rank = min(term_diff_max_rank,
+                                     np.sum(test_vec > 0))
+            best_test = sorted_test_weights[:truncated_max_rank]
             best_words_ids = np.intersect1d(best_example, best_test)
             best_words.append([k for k, v in
                                self.vectorizer.vocabulary_.items()
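The core of the stopword fix: a text composed entirely of stopwords vectorizes to an all-zero row, so distances against it are meaningless. A standalone sketch of the patch's two guards, with a toy count matrix standing in for the library's `term_mat`:

    import numpy as np
    from scipy import sparse

    term_mat = sparse.csr_matrix(np.array([
        [2, 0, 1],
        [0, 3, 0],
        [0, 0, 0],  # a text whose words were all stopwords
    ]))

    # Guard 1: mask out stored texts with no surviving terms
    good_ids = np.array(np.sum(term_mat, 1) > 0).squeeze()
    print(good_ids)  # [ True  True False]

    # Guard 2: a query that vectorizes to all zeros gets no suggestions
    exmpl_vec = sparse.csr_matrix((1, 3))  # empty 1x3 query vector
    if np.sum(exmpl_vec) == 0:
        print("no suggestions")  # the patched method returns [], [], []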
From 8ab3f5c671bd77095a5411ca7e83b464191d7154 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Tue, 13 Jun 2017 15:15:47 -0300
Subject: [PATCH 08/10] Add DeprecationWarning

It covers a suggestions parameter that is no longer used.
---
 textar/text_classifier.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index bba3708..30488e9 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -12,3 +12,4 @@
 import pandas as pd
 import numpy as np
 import os
+import warnings
@@ -165,7 +166,8 @@ def _make_text_vectors(self, examples):
         return textvec
 
     def get_similar(self, example, max_similars=3, similarity_cutoff=None,
-                    term_diff_max_rank=10, filter_list=None):
+                    term_diff_max_rank=10, filter_list=None,
+                    term_diff_cutoff=None):
         """Devuelve textos similares al ejemplo dentro de los textos
         entrenados.
 
         Nota:
@@ -185,6 +187,7 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                 que se devuelven en best_words).
             filter_list (list): Lista de ids de textos en la cual buscar textos
                 similares.
+            term_diff_cutoff (float): Deprecado. Se quitara en el futuro.
 
         Returns:
             tuple (list, list, list): (text_ids, sorted_dist, best_words)
@@ -197,6 +200,9 @@ def get_similar(self, example, max_similars=3, similarity_cutoff=None,
                 sugerencia.
         """
+        if term_diff_cutoff:
+            warnings.warn('Deprecado. Quedo sin uso. Se quitara en el futuro.',
+                          DeprecationWarning)
         if filter_list:
             if max_similars > len(filter_list):
                 raise ValueError("No se pueden pedir mas sugerencias que la \

From 718463d52d1a933988e16c8dc8e7f21a0ee867d8 Mon Sep 17 00:00:00 2001
From: meliascosta
Date: Tue, 13 Jun 2017 15:35:10 -0300
Subject: [PATCH 09/10] Add encoding declaration and change sklearn import

---
 tests/test_text_classifier.py | 3 +--
 textar/text_classifier.py    | 2 ++
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/tests/test_text_classifier.py b/tests/test_text_classifier.py
index 0a24663..394e6d6 100644
--- a/tests/test_text_classifier.py
+++ b/tests/test_text_classifier.py
@@ -11,10 +11,9 @@
 import os
 import codecs
 import numpy as np
-from sklearn.cross_validation import train_test_split
+from sklearn.model_selection import train_test_split
 from sklearn.datasets import fetch_20newsgroups
 sys.path.insert(0, os.path.abspath('..'))
-
 from textar import TextClassifier
 

diff --git a/textar/text_classifier.py b/textar/text_classifier.py
index 30488e9..429cced 100644
--- a/textar/text_classifier.py
+++ b/textar/text_classifier.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 u"""Módulo de clasificación de textos.
 
 Este módulo contiene a los objetos que permiten entrenar un clasificador

From 346f1a57d235cc1b1e6a2e1ad21714b616847c8b Mon Sep 17 00:00:00 2001
From: Ignacio Heredia
Date: Tue, 13 Jun 2017 15:52:44 -0300
Subject: [PATCH 10/10] Update tests for Python 3 and change the assert on
 result equality

---
 tests/test_text_classifier.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/tests/test_text_classifier.py b/tests/test_text_classifier.py
index 0a24663..25fc61a 100644
--- a/tests/test_text_classifier.py
+++ b/tests/test_text_classifier.py
@@ -32,7 +32,7 @@ def test_get_similar(self):
                 "El edificio más antiguo tiene muchas cuadros caros porque era de un multimillonario",
                 "El edificio más moderno tiene muchas programadoras que comen manzanas durante el almuerzo grupal"
             ],
-            ids=map(str, range(4))
+            ids=list(map(str, range(4)))
         )
 
         ids, distancias, palabras_comunes = tc.get_similar(
@@ -42,10 +42,14 @@ def test_get_similar(self):
 
         self.assertEqual(ids, ['0', '3', '2', '1'])
         self.assertEqual(
-            palabras_comunes,
+            [
+                sorted(palabras)
+                for palabras in palabras_comunes
+            ]
+            ,
             [
                 [u'edificio', u'manzanas'],
-                [u'edificio', u'muchas', u'manzanas'],
+                [u'edificio', u'manzanas', u'muchas'],
                 [u'edificio', u'muchas'],
                 [u'muchas']
             ]
         )
@@ -60,13 +64,13 @@ def test_classify(self):
                 "Para hacer una torta de naranja se necesita harina, huevos, leche, ralladura de naranja y polvo de hornear",
                 "Para hacer un lemon pie se necesita crema, ralladura de limón, huevos, leche y harina"
             ],
-            ids=map(str, range(6))
+            ids=list(map(str, range(6)))
         )
 
         # entrena un clasificador
         tc.make_classifier(
             name="recetas_classifier",
-            ids=map(str, range(6)),
+            ids=list(map(str, range(6))),
             labels=["Comida", "Comida", "Trago", "Trago", "Postre", "Postre"]
         )
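To close the series, a quick sketch verifying that the deprecated `term_diff_cutoff` argument now only emits a `DeprecationWarning` rather than affecting the results (hypothetical two-text corpus; `warnings.catch_warnings` is from the standard library):

    import warnings

    from textar import TextClassifier

    tc = TextClassifier(
        texts=["el gato come pescado", "el perro come carne"],
        ids=["0", "1"],
    )

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        tc.get_similar("el gato", max_similars=1, term_diff_cutoff=0.6)

    assert any(issubclass(w.category, DeprecationWarning) for w in caught)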