diff --git a/allmeans-tm/.DS_Store b/AllMeans/.DS_Store
similarity index 100%
rename from allmeans-tm/.DS_Store
rename to AllMeans/.DS_Store
diff --git a/allmeans-tm/__init__.py b/AllMeans/__init__.py
similarity index 100%
rename from allmeans-tm/__init__.py
rename to AllMeans/__init__.py
diff --git a/allmeans-tm/all_means.py b/AllMeans/all_means.py
similarity index 99%
rename from allmeans-tm/all_means.py
rename to AllMeans/all_means.py
index a3eb6df..8e70a7a 100644
--- a/allmeans-tm/all_means.py
+++ b/AllMeans/all_means.py
@@ -7,7 +7,7 @@ from sklearn.metrics import silhouette_score, davies_bouldin_score
 from itertools import product
 from sklearn.metrics.pairwise import cosine_similarity
-from resources import download_nltk_resources, get_sentence_transformer_model
+from .resources import download_nltk_resources, get_sentence_transformer_model
 import warnings


 class AllMeans:
diff --git a/allmeans-tm/resources.py b/AllMeans/resources.py
similarity index 100%
rename from allmeans-tm/resources.py
rename to AllMeans/resources.py
diff --git a/README.md b/README.md
index cda8f55..e63953a 100644
--- a/README.md
+++ b/README.md
@@ -2,29 +2,27 @@

 Automatic Topic Modelling (TM) using minimal user input and computational resources. I made this because my biggest issue with most TM modules is simple. If I knew how many topics I wanted, I would already have enough information about the text, such that performing TM would be redundant. AllMeans does not aim to replace existing TM frameworks, but instead aims to tackle the aspect of required user input to derive meaningful insights. With AllMeans, the user is simply required to pass a text, and run one method, with optionally ZERO decisions.

-See `Basic Modelling` example, below. AllMeans is designed to be simple, user-friendly, and practical. It doesn't invent anything that doesn't already exist in the passed text (it doesn't require loading enormous Word Embeddings models like GloVe). All that is needed is a text, in one string (no pre-processing needed), to create an AllMeans object, and to run the .model_topics() method.
+See the `Basic Modelling` example, below. AllMeans is designed to be simple, user-friendly, and practical. It doesn't invent anything that doesn't already exist in the passed text (it doesn't require loading enormous Word Embeddings models like GloVe). All that is needed is a text in one string (no pre-processing needed), to create an AllMeans object, and to run the .model_topics() method.

-## Usage
+Though AllMeans is not itself a single unsupervised algorithm, it relies on unsupervised algorithms to perform topic modelling, and these inherently work better with more data. AllMeans was developed with tests on texts from 1,000 to 100,000 characters in length, as it is intended to be flexible, but the quality of results will typically correlate positively with the size of the passed text.

-Install using:
+## Usage

-```
-$ pip install allmeans-tm
-```
+Install using: `$ pip install AllMeans`.

 ### Modelling Topics with AllMeans.model_topics()

-There are only two arguments to the .model_topics(), `early_stop` and `verbose`. Verbosity is a boolean, offering to print progress and a glimpse of the results as the method runs, and `early_stop` strongly positively correlates with the number of resulting topics found, though it is not a 1:1 relationship (i.e., passing early_stop = 3 will not necessarily result in 3 topics). As the method largely relies on iteratively comparing various Kmeans clustering results (through an averaged silhouette_score and davies_bouldin_score - both of which sklearn's implementations), the early_stop value (default = 2) determines after how many consecutively negatively trending iterations the method stops. The motivation for this being that there is typically a certain Kmeans value that scores best, after which point scores trend downwards, making these iterations often redundant. Thus, a lower early_stop value (\~2) will significantly decrease computational expense and time, but may also change performance. As each early_stop value does not necessarily build on lower values (for example, early_stop = 3 is not necessarily the same topics as early_stop = 2, plus *x* more topics), I suggest trying 2 or 3 values (I like to test something like early_stop = \[2, 3, 4, 5\]) to see how the passed text can be represented.
+There are only two arguments to the .model_topics() method, `early_stop` and `verbose`. `verbose` is a boolean that prints progress and a glimpse of the results as the method runs, and `early_stop` strongly positively correlates with the number of resulting topics found, though it is not a 1:1 relationship (i.e., passing early_stop = 3 will not necessarily result in 3 topics). As the method largely relies on iteratively comparing various KMeans clustering results (through an average of silhouette_score and davies_bouldin_score, both using sklearn's implementations), the early_stop value (default = 2) determines how many consecutive negatively trending iterations are allowed before the method stops. The motivation for this is that there is typically a certain number of KMeans clusters that scores best, after which scores trend downwards, making further iterations largely redundant. Thus, a lower early_stop value (\~2) will significantly decrease computational expense and time, but may also affect performance. As each early_stop value does not necessarily build on lower values (for example, early_stop = 3 does not necessarily yield the same topics as early_stop = 2 plus *x* more topics), I suggest trying 2 or 3 values (I like to test a range of values for early_stop such as \[2, 3, 4, 5\]) to see how the passed text can be represented.
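+
+For example, a quick (hypothetical) way to compare a few early_stop values, assuming your text is already in the string variable `text`:
+
+```
+from AllMeans import AllMeans
+
+# try several early_stop values and see how many topics each one yields
+for early_stop in [2, 3, 4, 5]:
+    allmeans = AllMeans(text = text)
+    clusters = allmeans.model_topics(early_stop = early_stop, verbose = False)
+    print(f"early_stop = {early_stop} -> {len(clusters)} topics: {list(clusters.keys())}")
+```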

 ## Examples

 ### Basic Modelling

 ```
-# !pip install allmeans-tm
-import AllMeans
+# !pip install AllMeans
+from AllMeans import AllMeans

-# assuming you have a text in the variable `text`
+# assuming you have a text in the string variable `text`
 allmeans = AllMeans(text = text)
 clusters = allmeans.model_topics(
     early_stop = 2, # default value
@@ -44,7 +42,7 @@ This example gets the text from the "Linguistics" Wikipedia, models its topics a
 ```
 # !pip3 install wikipedia-api
 import wikipediaapi
-wiki_wiki = wikipediaapi.Wikipedia(USER_AGENT_HERE', 'en') # check https://pypi.org/project/Wikipedia-API/ "user_agent" to understand this
+wiki_wiki = wikipediaapi.Wikipedia('USER_AGENT_HERE', 'en') # check https://pypi.org/project/Wikipedia-API/ "user_agent" to understand this

 page_py = wiki_wiki.page("Linguistics") # gets the text of entire Wikipedia "Linguistics" page
 text = page_py.text # returns str of entire page text -> check package docs for more useful methods
@@ -59,10 +57,12 @@ def average_compound_sentiment(texts):
     avg_score = sum(compound_scores) / len(compound_scores) if compound_scores else 0
     return avg_score

+# !pip install AllMeans
+from AllMeans import AllMeans
 # Use AllMeans to model topics from page
 allmeans = AllMeans(text = text)
 clusters = allmeans.model_topics(early_stop = 5, verbose = True)
->>> Note: there will be many printouts here due to verbose = True
+# >>> Note: there will be many printouts here due to verbose = True

 # Prepare the topics-sentences distribution data and mean sentiment per topic
 dist = {lab: len(sents) for lab, sents in clusters.items()}
diff --git a/setup.py b/setup.py
index 6ba570f..7455c52 100644
--- a/setup.py
+++ b/setup.py
@@ -1,15 +1,21 @@
 from setuptools import setup, find_packages

+# README
 with open("README.md", "r") as f:
     description = f.read()

+# LICENSE
+with open("LICENSE", "r") as f:
+    license = f.read()
+
 setup(
-    name = 'allmeans-tm',
-    version = '0.2.0',
+    name = 'AllMeans',
+    version = '1.0.3',
     author = 'Kai Maurin-Jones',
-    description = 'A package for automatic topic modelling',
+    description = 'A package for fully automatic topic modelling',
     packages = find_packages(),
     python_requires = '>=3.11.4',
+    license = license,
     install_requires = [
         'nltk==3.8.1',
         'numpy==1.24.3',
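
For reference, the per-topic sentiment step that the README's Wikipedia example builds towards can be sketched as follows. This is a hypothetical reconstruction rather than the author's exact code, assuming NLTK's VADER analyzer supplies the compound scores and that `clusters` is the dict of topic labels to sentence lists returned by `.model_topics()`:

```
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')  # VADER lexicon is only needed once
sia = SentimentIntensityAnalyzer()

def average_compound_sentiment(texts):
    # mean VADER compound score across a topic's sentences (0 if the topic has none)
    compound_scores = [sia.polarity_scores(t)['compound'] for t in texts]
    avg_score = sum(compound_scores) / len(compound_scores) if compound_scores else 0
    return avg_score

# sentences per topic, and mean compound sentiment per topic
dist = {lab: len(sents) for lab, sents in clusters.items()}
sentiments = {lab: average_compound_sentiment(sents) for lab, sents in clusters.items()}
```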