Merge pull request #6 from MediaComem/update-h-index-calculation
Update h index calculation
slecorne authored Apr 22, 2024
2 parents 747c190 + 55aa586 commit 057fa90
Showing 6 changed files with 56 additions and 63 deletions.
44 changes: 0 additions & 44 deletions dataset/calculate_h_index.py

This file was deleted.

64 changes: 50 additions & 14 deletions dataset/calculate_stats.py
@@ -7,6 +7,7 @@
import re, math
import numpy as np
import logging
+from scholarmetrics import hindex
logging.basicConfig(filename='logs/stats.log',filemode="w+",level=logging.INFO)
logger = logging.getLogger("Main")
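For reference, `scholarmetrics` is the new dependency this PR introduces; its `hindex` helper takes a sequence of per-publication citation counts and returns the h-index, replacing the hand-rolled loop deleted below. A minimal sketch of the call (citation counts are illustrative):

```python
# Sketch: the h-index is the largest h such that h publications
# have at least h citations each.
from scholarmetrics import hindex

citations = [10, 5, 3, 1]  # illustrative per-publication citation counts
print(hindex(citations))   # 3: three papers have >= 3 citations each
```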

@@ -117,10 +118,6 @@
"year":year,"month":month,"has_month":has_month,"is_plos":record["is_plos"],"is_bmc":record["is_bmc"],"is_pmc":record["is_pmc"],"has_das":record["has_das"],"authors":paper_authors,"authors_full":paper_authors_full}
counter += 1
print("End of record")
# create author matrices
for author in rev_authors_dict.keys():
authors_citations[author] = np.zeros(authors_n_publications[author])
print("End of matrice")
logger.info("Finished creating dictionaries")
# extract citation histories for indicators
for record in collection_records.find():
@@ -153,9 +150,14 @@
    # add citation to all paper authors
    date_index = records[ref_counter]["year"]*12 + records[ref_counter]["month"] - min_date
    for a in records[ref_counter]["authors"]:
+        if not a in authors_citations:
+            authors_citations[a] = dict()
+        if not ref_counter in authors_citations[a]:
+            authors_citations[a][ref_counter] = dict()
+        if not date_index in authors_citations[a][ref_counter]:
+            authors_citations[a][ref_counter][date_index] = 0
        # a is the author_counter
-        author_pub_index = authors_publications[a].index(ref_counter)
-        authors_citations[a][author_pub_index] += 1
+        authors_citations[a][ref_counter][date_index] += 1
print("End of computation citations")
for k,v in records.items():
    # sort year indexes and convert them into strings for Mongo
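A note on the data layout: the reworked bookkeeping above replaces the per-author NumPy vectors with dicts nested by author, cited paper, and month, which is what later lets the h-index be recomputed as of any publication date. A sketch of the shape, with invented keys:

```python
# Illustrative shape of authors_citations after this change
# (all keys below are made up for the example):
authors_citations = {
    7: {             # author index
        42: {        # cited paper index (ref_counter)
            310: 2,  # month index (year*12 + month - min_date) -> count
            311: 1,
        },
    },
}
```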
@@ -169,6 +171,29 @@
v["citations_one"] = sum([x for y,x in citations[k].items() if y < 1])
v["citations_two"] = sum([x for y,x in citations[k].items() if y < 2])
v["citations_three"] = sum([x for y,x in citations[k].items() if y < 3])
date_index = v["year"] * 12 + v["month"] - min_date
# Compute h-index for authors at time of publication
h_indexes = list()
for a in v["authors"]:
h_index = 0
local_cit_counts = dict()
if a in authors_citations:
for index, value in authors_citations[a].items():
for date, count in value.items():
# We took only citations before publication date
if date < date_index:
if not index in local_cit_counts:
local_cit_counts[index] = count
else:
local_cit_counts[index] += count
arrayOfCitation = list(local_cit_counts.values())
arrayOfCitation = sorted(arrayOfCitation, reverse=True)
h_index = hindex(arrayOfCitation)
else:
authors_citations[a] = dict()

h_indexes.append(int(h_index))
v["h_indexes"] = h_indexes

logger.info("Finished parsing all records")
print("End of parsing")
@@ -177,16 +202,27 @@
authors_dump = list()
for k,v in authors_full.items():
    A = authors_citations[k]
-    a = {"index":k,"name":v,"tot_cit":A.sum()}
+    total_cit_count = 0
    h_index = 0
-    local_cit_counts = list(A)
-    for p in local_cit_counts:
-        local_pub_counts = len([x for x in local_cit_counts if x >= p])
-        if local_pub_counts >= h_index and p >= h_index:
-            h_index = min(p, local_pub_counts)
+    publication_cit_count = list(A)
+    local_cit_counts = dict()
+    # Compute global h-index
+    for index, publication in A.items():
+        local_cit_counts[index] = 0
+        for date, count in publication.items():
+            total_cit_count += count
+            if index in local_cit_counts:
+                local_cit_counts[index] += count
+    arrayOfCitation = list(local_cit_counts.values())
+    arrayOfCitation = sorted(arrayOfCitation, reverse=True)
+    h_index = hindex(arrayOfCitation)
+    a = {"index":k,"name":v,"tot_cit":total_cit_count}
    a_citations = list()
-    for x,y in zip(local_cit_counts,authors_publications[k]):
-        a_citations.append({"title":records[y]["title"],"year":records[y]["year"],"publication_id":records[y]["publication_id"],"paper_id":y,"n_cit":x})
+    for y in authors_publications[k]:
+        cit = 0
+        if y in local_cit_counts:
+            cit = local_cit_counts[y]
+        a_citations.append({"title":records[y]["title"],"year":records[y]["year"],"publication_id":records[y]["publication_id"],"paper_id":y,"n_cit": cit})
    a["h_index"] = int(h_index)
    a["publications"] = a_citations
    authors_dump.append(a)
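On the same inputs, the deleted manual loop and the new `hindex` call should agree; for reference, a self-contained equivalent of what both compute:

```python
# With counts sorted descending, the h-index is the length of the
# prefix where the i-th count is still at least i.
def h_index(citation_counts):
    ordered = sorted(citation_counts, reverse=True)
    return sum(1 for i, c in enumerate(ordered, start=1) if c >= i)

assert h_index([10, 5, 3, 1]) == 3
assert h_index([]) == 0
```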
Binary file modified dataset/exports/export_plos.csv.zip
Binary file not shown.
4 changes: 2 additions & 2 deletions dataset/parser_main.py
@@ -14,7 +14,7 @@
# logs and basics
logging.basicConfig(filename='logs/parser_main.log',filemode="w+",level=logging.INFO)
logger = logging.getLogger("Main")
-plos_file = 'config/PLOS-v5.2.csv'
+plos_file = 'config/PLOS_v5.2.csv'
pmc_file = 'config/PMC_v5.2.csv'
plos_csv_header=['Journal','Publisher','Submission_Day','Submission_Month','Submission_Year','Acceptance_Day','Acceptance_Month','Acceptance_Year','Publication_Day','Publication_Month','Publication_Year','DOI','Type_of_Article','Article_Title','Major.Mesh.Terms','Country','Corresponding_Author_Institution','Funding_Statement','Data_Section.Text.Generated','Data_Generated','Data_Section.Text.Shared','Data_Shared','Data.Location','DA_data','Accessions','URL_data','Repositories_data','Data_DOIs','Preprint_Match','Preprint_DOI','Preprint_Title','Preprint_Authors','Preprint_Day','Preprint_Month','Preprint_Year','Preprint_URL','Preprint_Server','Code_Section.Text.Generated','Code_Generated','Code_Section.Text.Shared','Code_Shared','Code_Location','URL_code','Repositories_code','Quarter','ANZSRC']
JOB_LIMIT = 50000 # controls how many articles to process per batch
@@ -55,7 +55,7 @@ def get_doi_from_file(filename):

if __name__ == "__main__":

root_dirs = ["dev_set_2"] # if you want to use a small, dev set sampled using sample_dev_set.py
root_dirs = ["dev_set"] # if you want to use a small, dev set sampled using sample_dev_set.py
if MODE:
# change this folder to where your PubMed OA dump actually is. You can list multiple folders, all xml files within will be processed
root_dirs = ["PubMed/comm_use","PubMed/non_comm_use"]
3 changes: 2 additions & 1 deletion requirements.txt
@@ -11,4 +11,5 @@ scipy
beautifulsoup4
textblob
ipywidgets
-multiprocess
+multiprocess
+scholarmetrics==0.2.1
4 changes: 2 additions & 2 deletions test.md
@@ -21,7 +21,7 @@ The dataset was built in several stages. If you want to produce the best possibl
1. mongosh --host localhost -u user -p pass
2. use contexts
3. db.stats_dev.find( { citations_total: { $gt: 0 }, is_plos: true } )
-4. db.stats_dev.find( { citations_total: { $gt: 0 }, is_bmc: true } )
+4. db.stats_dev.find( { citations_total: { $gt: 0 }, is_pmc: true } )

There should be as many entries as during tests 4.3 and 8.3. In addition, the citations_total field should be equal to the number of files moved minus 1 during steps 3 and 7.
However, checking by date is a little more complex. First, you need to ensure that the numbers in the `citations_one`, `citations_two` and `citations_three` fields are consistent with the `citation_counts` field: `citations_one` contains the same number as the '0' entry in `citation_counts`, `citations_two` must be the sum of the '0' and '1' entries, and `citations_three` the sum of '0', '1' and '2'.
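That rule is easy to spot-check in code; a sketch against a fetched document (field names as in the `stats_dev` collection; it assumes the `citation_counts` keys are stringified year offsets):

```python
def check_citation_fields(doc):
    counts = {int(k): v for k, v in doc["citation_counts"].items()}
    assert doc["citations_one"] == sum(v for k, v in counts.items() if k < 1)
    assert doc["citations_two"] == sum(v for k, v in counts.items() if k < 2)
    assert doc["citations_three"] == sum(v for k, v in counts.items() if k < 3)
```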
@@ -38,7 +38,7 @@ Once this has been done, we can start validating the h_index. As things stand, t
2. use contexts
3. db.authors_dev.find( { name: 'YourAuthor' } )
4. You should have an h_index of 2
-4. Now take the result of `db.stats_dev.find( { citations_total: { $gt: 0 }, is_bmc: true } )`.
+4. Now take the result of `db.stats_dev.find( { citations_total: { $gt: 0 }, is_pmc: true } )`.
5. Choose one of the authors of the article and search for it in the global dataset.
6. Add the article and only one of the articles citing it to your test dataset
7. Run the following commands in a terminal:
