bug fixes
kmaurinjones committed Feb 18, 2024
1 parent 0769334 commit 330f765
Showing 6 changed files with 22 additions and 16 deletions.
File renamed without changes.
File renamed without changes.
2 changes: 1 addition & 1 deletion allmeans-tm/all_means.py → AllMeans/all_means.py
@@ -7,7 +7,7 @@
from sklearn.metrics import silhouette_score, davies_bouldin_score
from itertools import product
from sklearn.metrics.pairwise import cosine_similarity
from resources import download_nltk_resources, get_sentence_transformer_model
from .resources import download_nltk_resources, get_sentence_transformer_model
import warnings

class AllMeans:
File renamed without changes.
24 changes: 12 additions & 12 deletions README.md
@@ -2,29 +2,27 @@

Automatic Topic Modelling (TM) using minimal user input and computational resources. I made this because my biggest issue with most TM modules is simple: if I knew how many topics I wanted, I would already have enough information about the text that performing TM would be redundant. AllMeans does not aim to replace existing TM frameworks, but instead aims to minimize the user input required to derive meaningful insights. With AllMeans, the user simply passes a text and runs one method, with optionally ZERO decisions to make.

See `Basic Modelling` example, below. AllMeans is designed to be simple, user-friendly, and practical. It doesn't invent anything that doesn't already exist in the passed text (it doesn't require loading enormous Word Embeddings models like GloVe). All that is needed is a text, in one string (no pre-processing needed), to create an AllMeans object, and to run the .model_topics() method.
See the `Basic Modelling` example below. AllMeans is designed to be simple, user-friendly, and practical. It doesn't invent anything that doesn't already exist in the passed text (it doesn't require loading enormous Word Embeddings models like GloVe). All that is needed is the text as a single string (no pre-processing needed); from there, create an AllMeans object and run the .model_topics() method.

## Usage
Though AllMeans is not itself a single unsupervised algorithm, it relies on unsupervised algorithms to perform topic modelling, which inherently work better with more data. AllMeans was developed and tested on texts from 1,000 to 100,000 characters in length, as it is intended to be flexible, but the quality of results will typically correlate positively with the length of the passed text.

Install using:
## Usage

```
$ pip install allmeans-tm
```
Install using: `$ pip install AllMeans`.

### Modelling Topics with AllMeans.model_topics()

There are only two arguments to the .model_topics(), `early_stop` and `verbose`. Verbosity is a boolean, offering to print progress and a glimpse of the results as the method runs, and `early_stop` strongly positively correlates with the number of resulting topics found, though it is not a 1:1 relationship (i.e., passing early_stop = 3 will not necessarily result in 3 topics). As the method largely relies on iteratively comparing various Kmeans clustering results (through an averaged silhouette_score and davies_bouldin_score - both of which sklearn's implementations), the early_stop value (default = 2) determines after how many consecutively negatively trending iterations the method stops. The motivation for this being that there is typically a certain Kmeans value that scores best, after which point scores trend downwards, making these iterations often redundant. Thus, a lower early_stop value (\~2) will significantly decrease computational expense and time, but may also change performance. As each early_stop value does not necessarily build on lower values (for example, early_stop = 3 is not necessarily the same topics as early_stop = 2, plus *x* more topics), I suggest trying 2 or 3 values (I like to test something like early_stop = \[2, 3, 4, 5\]) to see how the passed text can be represented.
There are only two arguments to .model_topics(): `early_stop` and `verbose`. `verbose` is a boolean that prints progress and a glimpse of the results as the method runs, and `early_stop` correlates strongly and positively with the number of resulting topics found, though it is not a 1:1 relationship (i.e., passing early_stop = 3 will not necessarily result in 3 topics). As the method largely relies on iteratively comparing various KMeans clustering results (through an averaged silhouette_score and davies_bouldin_score, both of which are sklearn's implementations), the early_stop value (default = 2) determines after how many consecutively negatively trending iterations the method stops. The motivation for this is that there is typically a certain number of clusters that scores best, after which point scores trend downwards, making further iterations largely redundant. Thus, a lower early_stop value (\~2) will significantly decrease computational expense and time, but may also affect the quality of the results. As each early_stop value does not necessarily build on lower values (for example, early_stop = 3 does not necessarily yield the same topics as early_stop = 2 plus *x* more), I suggest trying 2 or 3 values (I like to test a range of values for early_stop such as \[2, 3, 4, 5\]) to see how the passed text can be represented.
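
For illustration, a minimal sketch of such a sweep (assuming `text` already holds the document as a single string, and that `.model_topics()` returns a dict mapping topic labels to lists of sentences, as in the examples below):

```
# !pip install AllMeans
from AllMeans import AllMeans

allmeans = AllMeans(text = text)  # `text` is assumed to be one pre-loaded string

# Compare how many topics each early_stop value yields
for early_stop in [2, 3, 4, 5]:
    clusters = allmeans.model_topics(early_stop = early_stop, verbose = False)
    print(f"early_stop = {early_stop}: {len(clusters)} topics -> {list(clusters.keys())}")
```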

## Examples

### Basic Modelling

```
# !pip install allmeans-tm
import AllMeans
# !pip install AllMeans
from AllMeans import AllMeans
# assuming you have a text in the variable `text`
# assuming you have a text in the string variable `text`
allmeans = AllMeans(text = text)
clusters = allmeans.model_topics(
    early_stop = 2, # default value
@@ -44,7 +42,7 @@ This example gets the text from the "Linguistics" Wikipedia, models its topics a
```
# !pip3 install wikipedia-api
import wikipediaapi
wiki_wiki = wikipediaapi.Wikipedia(USER_AGENT_HERE', 'en') # check https://pypi.org/project/Wikipedia-API/ "user_agent" to understand this
wiki_wiki = wikipediaapi.Wikipedia('USER_AGENT_HERE', 'en') # check https://pypi.org/project/Wikipedia-API/ "user_agent" to understand this
page_py = wiki_wiki.page("Linguistics") # gets the text of entire Wikipedia "Linguistics" page
text = page_py.text # returns str of entire page text -> check package docs for more useful methods
@@ -59,10 +57,12 @@ def average_compound_sentiment(texts):
    avg_score = sum(compound_scores) / len(compound_scores) if compound_scores else 0
    return avg_score
# !pip install AllMeans
from AllMeans import AllMeans
# Use AllMeans to model topics from page
allmeans = AllMeans(text = text)
clusters = allmeans.model_topics(early_stop = 5, verbose = True)
>>> Note: there will be many printouts here due to verbose = True
# >>> Note: there will be many printouts here due to verbose = True
# Prepare the topics-sentences distribution data and mean sentiment per topic
dist = {lab: len(sents) for lab, sents in clusters.items()}
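
# (Hypothetical continuation, not from the original example: assuming `clusters`
# maps each topic label to a list of sentences, as the line above suggests, the
# distribution and mean sentiment per topic could then be inspected like this.)
sentiments = {lab: average_compound_sentiment(sents) for lab, sents in clusters.items()}
for lab in sorted(dist, key = dist.get, reverse = True):
    print(f"{lab}: {dist[lab]} sentences, mean compound sentiment {sentiments[lab]:.3f}")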
12 changes: 9 additions & 3 deletions setup.py
@@ -1,15 +1,21 @@
from setuptools import setup, find_packages

# README
with open("README.md", "r") as f:
description = f.read()

# LICENSE
with open("LICENSE", "r") as f:
license = f.read()

setup(
    name = 'allmeans-tm',
    version = '0.2.0',
    name = 'AllMeans',
    version = '1.0.3',
    author = 'Kai Maurin-Jones',
    description = 'A package for automatic topic modelling',
    description = 'A package for fully automatic topic modelling',
    packages = find_packages(),
    python_requires = '>=3.11.4',
    license = license,
    install_requires = [
        'nltk==3.8.1',
        'numpy==1.24.3',
