Merge pull request #1 from MediaComem/compute-citations-count
Compute citations count
slecorne authored Nov 15, 2023
2 parents 78aea85 + f278c80 commit 7df7ee4
Showing 487 changed files with 167,402 additions and 304 deletions.
Binary file added .DS_Store
4 changes: 4 additions & 0 deletions .gitignore
@@ -108,4 +108,8 @@ venv.bak/

# mypy
.mypy_cache/

.Rproj.user
dataset/config/PMC-Dataset-Oct8_2023.csv
dataset/config/PLOS-Dataset-Oct8_2023.csv
data
12 changes: 11 additions & 1 deletion README.md
@@ -1,9 +1,19 @@
# das-public


## Code and data

* See the [dataset folder](dataset) to create, from the [PubMed Central OA collection](https://www.ncbi.nlm.nih.gov/pmc/tools/openftlist) dataset, a CSV file for analyzing the impact of author h-index on publication citations.
* To validate the code, please refer to the [testing procedure](test.md).

# Original work
This repository is based on the following previous work:

[![DOI](https://zenodo.org/badge/180121200.svg)](https://zenodo.org/badge/latestdoi/180121200)
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/gh/alan-turing-institute/das-public/master?filepath=notebooks%2FDescriptiveFigures.ipynb)

## Publication
## Previous publications
The original code is mentioned in the following papers:

* 📃 Preprint: https://arxiv.org/abs/1907.02565.
* 📝 Peer reviewed publication: https://doi.org/10.1371/journal.pone.0230416
Binary file added dataset/.DS_Store
18 changes: 10 additions & 8 deletions dataset/README.md
@@ -14,14 +14,16 @@ Folder containing the necessary code to create a dataset for analysis from the P
## Instructions

1. Download the Pubmed OA collection, e.g. via their FTP service: https://www.ncbi.nlm.nih.gov/pmc/tools/ftp. Optionally sample it using the [sample_dev_set.py](sample_dev_set.py) script (or use the development dataset of 1000 articles which is provided in the [dev set folder](dev_set)).
2. Setup a MongoDB and update the [config file](config/config.conf).
3. Run the [parser_main.py](parser_main.py) script, which will create a first collection of articles in Mongo.
4. Run the [calculate_stats.py](calculate_stats.py) script, which will calculate citation counts for articles and authors and create the relative collections in Mongo.
5. Run the [get_export.py](get_export.py) script, which will create a first export of the dataset in the [exports folder](exports).
6. Run the [get_das_unique.py](get_das_unique.py) script, which will pull out unique DAS for classification.
7. Follow the instructions in the [DAS classifier README](das_classifier/README.md).
8. Run the [get_export_merged.py](get_export_merged.py) script, to create the final dataset for analysis.
9. Optionally, run the [evaluation_plos.py](evaluation_plos.py) and [get_authors_top.py](get_authors_top.py) for evaluation.
2. Set up a MongoDB instance and update the [config file](config/config.conf), or run `docker compose up` with the current config.
3. Uncompress `PLOS_Dataset_Classification.zip` in the config folder, then move the folder's content into the current folder.
4. Run the [parser_main.py](parser_main.py) script, which will create a first collection of articles in Mongo.
5. Run the [calculate_stats.py](calculate_stats.py) script, which will calculate citation counts for articles and authors and create the corresponding collections in Mongo.
6. Run the [calculate_h_index.py](calculate_h_index.py) script, which will update the `h_indexes` element of each document with the result of the h-index calculation.
7. Run the [get_export.py](get_export.py) script, which will create a first export of the dataset in the [exports folder](exports).
8. Run the [get_das_unique.py](get_das_unique.py) script, which will pull out unique DAS for classification.
9. Follow the instructions in the [DAS classifier README](das_classifier/README.md).
10. Run the [get_export_merged.py](get_export_merged.py) script to create the final dataset for analysis.
11. Optionally, run [evaluation_plos.py](evaluation_plos.py) and [get_authors_top.py](get_authors_top.py) for evaluation. (Steps 4–8 can be chained as in the sketch after this list.)
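
A minimal sketch of a driver that chains steps 4–8, assuming MongoDB is already up, the scripts sit in this folder, and each script exits with a non-zero code on failure (the file name `run_pipeline.py` and the error handling are illustrative, not part of the repository):

```python
# run_pipeline.py -- illustrative driver for steps 4-8 above.
import subprocess
import sys

STEPS = [
    "parser_main.py",        # step 4: first collection of articles in Mongo
    "calculate_stats.py",    # step 5: citation counts for articles and authors
    "calculate_h_index.py",  # step 6: per-author h-indexes
    "get_export.py",         # step 7: first export of the dataset
    "get_das_unique.py",     # step 8: unique DAS for classification
]

for script in STEPS:
    print(f"Running {script} ...")
    result = subprocess.run([sys.executable, script])
    if result.returncode != 0:
        sys.exit(f"{script} failed with exit code {result.returncode}")
```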

## Requirements

44 changes: 44 additions & 0 deletions dataset/calculate_h_index.py
@@ -0,0 +1,44 @@
# MongoDB
from pymongo import MongoClient
from pymongo import HASHED, ASCENDING
from configparser import ConfigParser
config_set = "localhost" # this is in localhost
config = ConfigParser(allow_no_value=False)
config.read("config/config.conf")
mongo_db = config.get(config_set, 'db-name')
mongo_user = config.get(config_set, 'username')
mongo_pwd = config.get(config_set, 'password')
mongo_auth = config.get(config_set, 'auth-db')
mongo_host = config.get(config_set, 'db-host')
mongo_port = config.get(config_set, 'db-port')
client = MongoClient(mongo_host,
username=mongo_user,
password=mongo_pwd,)
db = client[mongo_db]

collection = db.stats_dev
collection_authors = db.authors_dev
collection_stats = db.stats_with_hindex


records = list()
for record in collection.find():
    h_indexes = list()
    for a in record["authors"]:
        author_h_index = collection_authors.find_one({"index": a})['h_index']
        h_indexes.append(author_h_index)
    record['h-indexes'] = h_indexes
    records.append(record)
print("End of h-indexes")

collection_stats.insert_many(records)

collection_stats.create_index([('id_doi', HASHED)], background=True)
collection_stats.create_index([('id_pmc', ASCENDING)],
background=True)
collection_stats.create_index([('id_pmid', ASCENDING)],
background=True)
collection_stats.create_index([('id_publisher', HASHED)],
background=True)

print("\nFinished!")
54 changes: 24 additions & 30 deletions dataset/calculate_stats.py
@@ -23,9 +23,10 @@
mongo_auth = config.get(config_set, 'auth-db')
mongo_host = config.get(config_set, 'db-host')
mongo_port = config.get(config_set, 'db-port')
client = MongoClient(mongo_host)
client = MongoClient(mongo_host,
username=mongo_user,
password=mongo_pwd,)
db = client[mongo_db]
db.authenticate(mongo_user, mongo_pwd, source=mongo_auth)

# select which collection to use in Mongo, start by dropping if needed (we do not update an existing collection here)
db.drop_collection("stats_dev")
@@ -35,12 +36,11 @@
collection_records = db.publications_dev

if __name__ == "__main__":

    # create global publication index and load relevant info in memory
    pmid_dict = defaultdict(int)
    rev_pmid_dict = defaultdict(int)
    pmc_dict = defaultdict(int)
    rev_pmc_dict = defaultdict(int)
    rev_pmc_dict = defaultdict(str)
    doi_dict = defaultdict(str)
    rev_doi_dict = defaultdict(int)
    publisher_dict = defaultdict(str)
Expand All @@ -66,7 +66,7 @@
        # identification
        if record["id_pmc"]:
            pmc_dict[counter] = record["id_pmc"]
            rev_pmid_dict[record["id_pmc"]] = counter
            rev_pmc_dict[record["id_pmc"]] = counter
        if record["id_pmid"]:
            pmid_dict[counter] = record["id_pmid"]
            rev_pmid_dict[record["id_pmid"]] = counter
@@ -114,23 +114,23 @@
author_counter += 1

records[counter] = {"publication_id":record["_id"],"title":record["title"],"id_pmc":record["id_pmc"],"id_pmid":record["id_pmid"],"id_publisher":record["id_publisher"],"id_doi":record["id_doi"],
"year":year,"month":month,"has_month":has_month,"is_plos":record["is_plos"],"is_bmc":record["is_bmc"],"has_das":record["has_das"],"authors":paper_authors,"authors_full":paper_authors_full}
"year":year,"month":month,"has_month":has_month,"is_plos":record["is_plos"],"is_bmc":record["is_bmc"],"is_pmc":record["is_pmc"],"has_das":record["has_das"],"authors":paper_authors,"authors_full":paper_authors_full}
counter += 1

print("End of record")
# create author matrices
for author in rev_authors_dict.keys():
authors_citations[author] = np.zeros((authors_n_publications[author],max_date-min_date+1))
authors_citations[author] = np.zeros(authors_n_publications[author])
print("End of matrice")
logger.info("Finished creating dictionaries")

# extract citation histories for indicators
for record in collection_records.find():
counter = mongo_ids[record["_id"]]
for ref in record["references"]:
if ref["year"] and len(ref["identifiers"]):
ref_counter = None
for local_id in ref["identifiers"]:
if local_id["type"] == "pmid" and local_id["id"] in rev_pmid_dict.keys():
ref_counter = rev_pmid_dict[local_id["id"]]
if local_id["type"] == "pmid" and int(local_id["id"]) in rev_pmid_dict.keys():
ref_counter = rev_pmid_dict[int(local_id["id"])]
break
elif local_id["type"] == "pmc" and local_id["id"] in rev_pmc_dict.keys():
ref_counter = rev_pmc_dict[local_id["id"]]
@@ -143,7 +143,10 @@
                        break
                if not ref_counter:
                    continue
                citation_year = math.floor(((records[counter]["year"]*12 + records[counter]["month"]) - (records[ref_counter]["year"]*12 + records[ref_counter]["month"])) / 12) # bin into years 0, 1, 2, etc. from publication
                # citation year relative to the base publication date, at month granularity
                citation_year = records[counter]["year"] - records[ref_counter]["year"]
                if records[counter]["month"] < records[ref_counter]["month"]:
                    citation_year = citation_year - 1
                if citation_year < 0:
                    citation_year = 0 # put as a citation during the first year for citations into the future and the like
                citations[ref_counter][citation_year] += 1
@@ -152,42 +155,31 @@
                for a in records[ref_counter]["authors"]:
                    # a is the author_counter
                    author_pub_index = authors_publications[a].index(ref_counter)
                    authors_citations[a][author_pub_index,date_index] += 1

                    authors_citations[a][author_pub_index] += 1
    print("End of computation citations")
    for k,v in records.items():
        # sort year indexes and convert them into strings for Mongo
        cd = sorted({x: y for x, y in citations[k].items()}.items(), key=lambda x: x[0], reverse=False)
        cdd = OrderedDict()

        for kk,vv in cd:
            cdd.update({str(kk):vv})
        v["citation_counts"] = cdd
        v["citations_total"] = sum([x for x in citations[k].values()])
        v["citations_one"] = sum([x for y,x in citations[k].items() if y < 1])
        v["citations_two"] = sum([x for y,x in citations[k].items() if y < 2])
        v["citations_three"] = sum([x for y,x in citations[k].items() if y < 3])
        v["citations_five"] = sum([x for y,x in citations[k].items() if y < 5])
        date_index = v["year"] * 12 + v["month"] - min_date
        # for every author, get h-index before this date
        h_indexes = list()
        for a in v["authors"]:
            h_index = 0
            A = authors_citations[a][:,:date_index-1]
            local_cit_counts = list(A.sum(axis=1))
            for p in local_cit_counts:
                local_pub_counts = len([x for x in local_cit_counts if x >= p])
                if local_pub_counts >= h_index and p >= h_index:
                    h_index = min(p,local_pub_counts)
            h_indexes.append(int(h_index))
        v["h_indexes"] = h_indexes

    logger.info("Finished parsing all records")
    print("End of parsing")

    # export authors and their h-indexes
    authors_dump = list()
    for k,v in authors_full.items():
        A = authors_citations[k]
        a = {"index":k,"name":v,"tot_cit":A.sum()}
        h_index = 0
        local_cit_counts = list(A.sum(axis=1))
        local_cit_counts = list(A)
        for p in local_cit_counts:
            local_pub_counts = len([x for x in local_cit_counts if x >= p])
            if local_pub_counts >= h_index and p >= h_index:
@@ -198,7 +190,7 @@
a["h_index"] = int(h_index)
a["publications"] = a_citations
authors_dump.append(a)

print("End of authors")
# dump all
collection.insert_many([r for r in records.values()])
collection_authors.insert_many(authors_dump)
@@ -214,6 +206,8 @@
                                    background=True)
    collection_authors.create_index([('tot_cit', ASCENDING)],
                                    background=True)
    collection_authors.create_index([('index', ASCENDING)],
                                    background=True)

    logger.info("Finished!")
    print("\nFinished!")
Binary file added dataset/config/PLOS_Dataset_Classification.zip
4 changes: 2 additions & 2 deletions dataset/config/config.conf
@@ -1,7 +1,7 @@
[localhost]
db-name = contexts
username = writer
password = 1243
username = user
password = pass
auth-db = contexts
db-host = localhost
db-port = 27017