Merge pull request #6 from MediaComem/update-h-index-calculation
Update h index calculation
slecorne authored Apr 22, 2024
2 parents 747c190 + 55aa586 commit 057fa90
Showing 6 changed files with 56 additions and 63 deletions.
44 changes: 0 additions & 44 deletions dataset/calculate_h_index.py

This file was deleted.

64 changes: 50 additions & 14 deletions dataset/calculate_stats.py
@@ -7,6 +7,7 @@
import re, math
import numpy as np
import logging
+from scholarmetrics import hindex
logging.basicConfig(filename='logs/stats.log',filemode="w+",level=logging.INFO)
logger = logging.getLogger("Main")
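For reference, `scholarmetrics` is the new dependency this PR introduces; its `hindex` helper takes a sequence of per-publication citation counts and returns the h-index, replacing the hand-rolled loop deleted below. A minimal sketch of the call (citation counts are illustrative):

```python
# Sketch: the h-index is the largest h such that h publications
# have at least h citations each.
from scholarmetrics import hindex

citations = [10, 5, 3, 1]  # illustrative per-publication citation counts
print(hindex(citations))   # 3: three papers have >= 3 citations each
```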

@@ -117,10 +118,6 @@
"year":year,"month":month,"has_month":has_month,"is_plos":record["is_plos"],"is_bmc":record["is_bmc"],"is_pmc":record["is_pmc"],"has_das":record["has_das"],"authors":paper_authors,"authors_full":paper_authors_full}
counter += 1
print("End of record")
# create author matrices
for author in rev_authors_dict.keys():
authors_citations[author] = np.zeros(authors_n_publications[author])
print("End of matrice")
logger.info("Finished creating dictionaries")
# extract citation histories for indicators
for record in collection_records.find():
@@ -153,9 +150,14 @@
    # add citation to all paper authors
    date_index = records[ref_counter]["year"]*12 + records[ref_counter]["month"] - min_date
    for a in records[ref_counter]["authors"]:
+        if not a in authors_citations:
+            authors_citations[a] = dict()
+        if not ref_counter in authors_citations[a]:
+            authors_citations[a][ref_counter] = dict()
+        if not date_index in authors_citations[a][ref_counter]:
+            authors_citations[a][ref_counter][date_index] = 0
        # a is the author_counter
-        author_pub_index = authors_publications[a].index(ref_counter)
-        authors_citations[a][author_pub_index] += 1
+        authors_citations[a][ref_counter][date_index] += 1
print("End of computation citations")
for k,v in records.items():
    # sort year indexes and convert them into strings for Mongo
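A note on the data layout: the reworked bookkeeping above replaces the per-author NumPy vectors with dicts nested by author, cited paper, and month, which is what later lets the h-index be recomputed as of any publication date. A sketch of the shape, with invented keys:

```python
# Illustrative shape of authors_citations after this change
# (all keys below are made up for the example):
authors_citations = {
    7: {             # author index
        42: {        # cited paper index (ref_counter)
            310: 2,  # month index (year*12 + month - min_date) -> count
            311: 1,
        },
    },
}
```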
@@ -169,6 +171,29 @@
v["citations_one"] = sum([x for y,x in citations[k].items() if y < 1])
v["citations_two"] = sum([x for y,x in citations[k].items() if y < 2])
v["citations_three"] = sum([x for y,x in citations[k].items() if y < 3])
date_index = v["year"] * 12 + v["month"] - min_date
# Compute h-index for authors at time of publication
h_indexes = list()
for a in v["authors"]:
h_index = 0
local_cit_counts = dict()
if a in authors_citations:
for index, value in authors_citations[a].items():
for date, count in value.items():
# We took only citations before publication date
if date < date_index:
if not index in local_cit_counts:
local_cit_counts[index] = count
else:
local_cit_counts[index] += count
arrayOfCitation = list(local_cit_counts.values())
arrayOfCitation = sorted(arrayOfCitation, reverse=True)
h_index = hindex(arrayOfCitation)
else:
authors_citations[a] = dict()

h_indexes.append(int(h_index))
v["h_indexes"] = h_indexes

logger.info("Finished parsing all records")
print("End of parsing")
@@ -177,16 +202,27 @@
authors_dump = list()
for k,v in authors_full.items():
    A = authors_citations[k]
-    a = {"index":k,"name":v,"tot_cit":A.sum()}
+    total_cit_count = 0
    h_index = 0
-    local_cit_counts = list(A)
-    for p in local_cit_counts:
-        local_pub_counts = len([x for x in local_cit_counts if x >= p])
-        if local_pub_counts >= h_index and p >= h_index:
-            h_index = min(p, local_pub_counts)
+    publication_cit_count = list(A)
+    local_cit_counts = dict()
+    # Compute global h-index
+    for index, publication in A.items():
+        local_cit_counts[index] = 0
+        for date, count in publication.items():
+            total_cit_count += count
+            if index in local_cit_counts:
+                local_cit_counts[index] += count
+    arrayOfCitation = list(local_cit_counts.values())
+    arrayOfCitation = sorted(arrayOfCitation, reverse=True)
+    h_index = hindex(arrayOfCitation)
+    a = {"index":k,"name":v,"tot_cit":total_cit_count}
    a_citations = list()
-    for x,y in zip(local_cit_counts,authors_publications[k]):
-        a_citations.append({"title":records[y]["title"],"year":records[y]["year"],"publication_id":records[y]["publication_id"],"paper_id":y,"n_cit":x})
+    for y in authors_publications[k]:
+        cit = 0
+        if y in local_cit_counts:
+            cit = local_cit_counts[y]
+        a_citations.append({"title":records[y]["title"],"year":records[y]["year"],"publication_id":records[y]["publication_id"],"paper_id":y,"n_cit": cit})
    a["h_index"] = int(h_index)
    a["publications"] = a_citations
    authors_dump.append(a)
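On the same inputs, the deleted manual loop and the new `hindex` call should agree; for reference, a self-contained equivalent of what both compute:

```python
# With counts sorted descending, the h-index is the length of the
# prefix where the i-th count is still at least i.
def h_index(citation_counts):
    ordered = sorted(citation_counts, reverse=True)
    return sum(1 for i, c in enumerate(ordered, start=1) if c >= i)

assert h_index([10, 5, 3, 1]) == 3
assert h_index([]) == 0
```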
Binary file modified dataset/exports/export_plos.csv.zip
Binary file not shown.
4 changes: 2 additions & 2 deletions dataset/parser_main.py
@@ -14,7 +14,7 @@
# logs and basics
logging.basicConfig(filename='logs/parser_main.log',filemode="w+",level=logging.INFO)
logger = logging.getLogger("Main")
-plos_file = 'config/PLOS-v5.2.csv'
+plos_file = 'config/PLOS_v5.2.csv'
pmc_file = 'config/PMC_v5.2.csv'
plos_csv_header=['Journal','Publisher','Submission_Day','Submission_Month','Submission_Year','Acceptance_Day','Acceptance_Month','Acceptance_Year','Publication_Day','Publication_Month','Publication_Year','DOI','Type_of_Article','Article_Title','Major.Mesh.Terms','Country','Corresponding_Author_Institution','Funding_Statement','Data_Section.Text.Generated','Data_Generated','Data_Section.Text.Shared','Data_Shared','Data.Location','DA_data','Accessions','URL_data','Repositories_data','Data_DOIs','Preprint_Match','Preprint_DOI','Preprint_Title','Preprint_Authors','Preprint_Day','Preprint_Month','Preprint_Year','Preprint_URL','Preprint_Server','Code_Section.Text.Generated','Code_Generated','Code_Section.Text.Shared','Code_Shared','Code_Location','URL_code','Repositories_code','Quarter','ANZSRC']
JOB_LIMIT = 50000 # controls how many articles to process per batch
@@ -55,7 +55,7 @@ def get_doi_from_file(filename):

if __name__ == "__main__":

root_dirs = ["dev_set_2"] # if you want to use a small, dev set sampled using sample_dev_set.py
root_dirs = ["dev_set"] # if you want to use a small, dev set sampled using sample_dev_set.py
if MODE:
# change this folder to where your PubMed OA dump actually is. You can list multiple folders, all xml files within will be processed
root_dirs = ["PubMed/comm_use","PubMed/non_comm_use"]
3 changes: 2 additions & 1 deletion requirements.txt
@@ -11,4 +11,5 @@ scipy
beautifulsoup4
textblob
ipywidgets
-multiprocess
+multiprocess
+scholarmetrics==0.2.1
4 changes: 2 additions & 2 deletions test.md
@@ -21,7 +21,7 @@ The dataset was built in several stages. If you want to produce the best possibl
1. mongosh --host localhost -u user -p pass
2. use contexts
3. db.stats_dev.find( { citations_total: { $gt: 0 }, is_plos: true } )
-4. db.stats_dev.find( { citations_total: { $gt: 0 }, is_bmc: true } )
+4. db.stats_dev.find( { citations_total: { $gt: 0 }, is_pmc: true } )

There should be as many entries as during tests 4.3 and 8.3. In addition, the citations_total field should be equal to the number of files moved minus 1 during steps 3 and 7.
However, checking by date is a little more complex. First, you need to ensure that the numbers in the `citations_one`, `citations_two` and `citations_three` fields are consistent with the `citation_counts` field: `citations_one` contains the same number as the '0' entry in `citation_counts`, `citations_two` must be the sum of the '0' and '1' entries, and `citations_three` the sum of '0', '1' and '2'.
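That rule is easy to spot-check in code; a sketch against a fetched document (field names as in the `stats_dev` collection; it assumes the `citation_counts` keys are stringified year offsets):

```python
def check_citation_fields(doc):
    counts = {int(k): v for k, v in doc["citation_counts"].items()}
    assert doc["citations_one"] == sum(v for k, v in counts.items() if k < 1)
    assert doc["citations_two"] == sum(v for k, v in counts.items() if k < 2)
    assert doc["citations_three"] == sum(v for k, v in counts.items() if k < 3)
```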
@@ -38,7 +38,7 @@ Once this has been done, we can start validating the h_index. As things stand, t
2. use contexts
3. db.authors_dev.find( { name: 'YourAuthor' } )
4. You should have an h_index of 2
-4. Now take the result of `db.stats_dev.find( { citations_total: { $gt: 0 }, is_bmc: true } )`.
+4. Now take the result of `db.stats_dev.find( { citations_total: { $gt: 0 }, is_pmc: true } )`.
5. Choose one of the authors of the article and search for it in the global dataset.
6. Add the article and only one of the articles citing it to your test dataset
7. Run the following commands in a terminal:
