diff --git a/analysis.ipynb b/analysis.ipynb
new file mode 100644
index 0000000..4f1e6b7
--- /dev/null
+++ b/analysis.ipynb
@@ -0,0 +1,560 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "March 6, 2022\n",
+    "\n",
+    "**Analysis**\n",
+    "## Identifying science in the news: An assessment of the precision and recall of Altmetric.com news mention data\n",
+    "\n",
+    "_Juan Pablo Alperin, ScholCommLab/School of Publishing, Simon Fraser University_\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Related Publication:**\n",
+    "\n",
+    "Fleerackers, A., Nehring, L., Maggio, L.A., Enkhbayar, A., Moorhead, L., Alperin, J.P. (2022). Identifying science in the news: An assessment of the precision and recall of Altmetric.com news mention data. _arXiv_\n",
+    "\n",
+    "\n",
+    "**Related Data:**\n",
+    "\n",
+    "Fleerackers, Alice; Nehring, Lise; Alperin, Juan Pablo; Enkhbayar, Asura; Maggio, Lauren A.; Moorhead, Laura, 2022, \"Replication data for Identifying science in the news\", [https://doi.org/10.7910/DVN/WNDOFL](https://doi.org/10.7910/DVN/WNDOFL), _Harvard Dataverse_, V1, UNF:6:k9Hv0lysKrB+tQLkdOEZOw== [fileUNF] "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Set this flag to True if you want to download the dataset\n",
+    "download_files = True\n",
+    "\n",
+    "if download_files: \n",
+    "    import os\n",
+    "    from pyDataverse.api import NativeApi, DataAccessApi\n",
+    "    from pyDataverse.models import Dataverse\n",
+    "    \n",
+    "    if not os.path.exists('data/'):\n",
+    "        os.makedirs('data/')\n",
+    "\n",
+    "    base_url = 'https://dataverse.harvard.edu/'\n",
+    "\n",
+    "    api = NativeApi(base_url)\n",
+    "    data_api = DataAccessApi(base_url)\n",
+    "\n",
+    "\n",
+    "    DOI = \"doi:10.7910/DVN/WNDOFL\"\n",
+    "    dataset = api.get_dataset(DOI)\n",
+    "    \n",
+    "    files_list = dataset.json()['data']['latestVersion']['files']\n",
+    "\n",
+    "    for file in files_list:\n",
+    "        filename = file[\"dataFile\"][\"filename\"]\n",
+    "        file_id = file[\"dataFile\"][\"id\"]\n",
+    "        print(\"File name {}, id {}\".format(filename, file_id))\n",
+    "\n",
+    "        response = data_api.get_datafile(file_id)\n",
+    "        with open('data/' + filename, \"wb\") as f:\n",
+    "            f.write(response.content)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# helper functions used later to match URLs and identifiers\n",
+    "def clean_url(url):\n",
+    "    return url.split('?')[0].strip('/').lower()\n",
+    "\n",
+    "def clean_doi(doi):\n",
+    "    try: \n",
+    "        if type(doi) == int:\n",
+    "            doi = str(doi)\n",
+    "        return doi.strip('/. ').lower()\n",
+    "    except:\n",
+    "        return np.nan\n",
+    "    \n",
+    "def make_bool(x):\n",
+    "    if x == 0 or x == 'No':\n",
+    "        return False\n",
+    "    if x == 1 or x == 'Yes':\n",
+    "        return True\n",
+    "    return None"
+   ]
+  },
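+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick illustration (added for this write-up, not part of the original analysis) of what the helpers above do. The URL, DOI, and codes below are made-up examples rather than records from the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative values only (hypothetical, not from the Altmetric or coded data)\n",
+    "print(clean_url('https://www.example.com/Some-Story/?utm_source=rss'))  # https://www.example.com/some-story\n",
+    "print(clean_doi('10.1234/ABC.567/'))                                    # 10.1234/abc.567\n",
+    "print(clean_doi(np.nan))                                                # nan (non-strings fall through)\n",
+    "print(make_bool('Yes'), make_bool(0), make_bool('maybe'))               # True False None"
+   ]
+  },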
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "id_cols = ['DOI', 'ISBN', 'clinical_trial_id', 'URI', 'pubmed_id', 'pmc_id', 'handle', 'ads_bibcode', 'arxiv_id', 'repec_id', 'SSRN', 'URN']\n",
+    "dtypes = {c: str for c in id_cols}\n",
+    "\n",
+    "# Data downloaded from Altmetric Explorer on Sept 9, 2021\n",
+    "# Filtered to all research mentions in the following outlets since March 1, 2021:\n",
+    "# The Guardian, HealthDay, IFLScience, MedPage Today, News Medical, New York Times, Popular Science, and Wired\n",
+    "alt = pd.read_table('data/altmetric_dataset.tab', sep=\"\\t\", dtype=dtypes, encoding='utf8')\n",
+    "alt['URL'] = alt.URL.map(clean_url)\n",
+    "alt['DOI'] = alt.DOI.map(clean_doi)\n",
+    "alt.loc[:,'alt_id'] = alt.index\n",
+    "\n",
+    "alt['outlet'] = alt.outlet.map(lambda x: x.strip())\n",
+    "\n",
+    "alt.loc[:,id_cols] = alt.loc[:,id_cols].applymap(lambda x: x.lower() if x == x else x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# We found some errors in the identifiers of the Altmetric data that had obvious corrections, which we applied manually\n",
+    "url = 'https://www.iflscience.com/space/european-satellite-finds-12-very-rare-einstein-crosses'\n",
+    "i = '2012'\n",
+    "alt.loc[(alt.URL == url) & (alt.arxiv_id == i), 'arxiv_id'] = '2012.10051'\n",
+    "\n",
+    "url = 'https://www.wired.com/story/mathematicians-settle-the-erdos-coloring-conjecture'\n",
+    "i = '2101'\n",
+    "alt.loc[(alt.URL == url) & (alt.arxiv_id == i), 'arxiv_id'] = '2101.04698'\n",
+    "\n",
+    "url = 'https://www.news-medical.net/news/20210421/research-offers-new-insights-on-the-significance-of-hyperinflammation-following-sars-cov-2-infection.aspx'\n",
+    "i = '10.1002/(issn)1529-0131'\n",
+    "alt.loc[(alt.URL == url) & (alt.DOI == i), 'DOI'] = '10.1002/art.41763'"
+   ]
+  },
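+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The two arXiv corrections above fix identifiers that were truncated to a bare year-month prefix (e.g. '2012'). The optional check below is one way to surface similar candidates for manual review; it assumes pandas >= 1.1 for `str.fullmatch` and was not part of the original cleaning."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# List Altmetric rows whose arXiv ID is only a 4-digit prefix, which usually\n",
+    "# indicates a truncated identifier worth reviewing by hand (illustrative check only).\n",
+    "suspect_arxiv = alt[alt.arxiv_id.str.fullmatch(r'\\d{4}', na=False)]\n",
+    "suspect_arxiv[['outlet', 'URL', 'arxiv_id']]"
+   ]
+  },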
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "**Some code we used to clean up and standardize the data we coded**\n",
+    "\n",
+    "**The final output file is included in the published dataset**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# gold = pd.read_excel('dataset/gold.xlsx', engine='openpyxl')\n",
+    "\n",
+    "# gold['URL'] = gold.URL.map(clean_url)\n",
+    "# gold['DOI'] = gold.DOI.map(clean_doi)\n",
+    "\n",
+    "# gold.loc[:,'outlet'] = gold.outlet.str.strip()\n",
+    "# gold.loc[:,code_cols] = gold.loc[:,code_cols].applymap(make_bool)\n",
+    "# gold.rename(columns={'DOI': 'identifier'}, inplace=True)\n",
+    "\n",
+    "# gold.loc[:,'identifier'] = gold.identifier\n",
+    "\n",
+    "# s = 'arxiv:'\n",
+    "# gold.loc[gold.identifier.str.startswith(s, na=False),'identifier'] = gold.loc[gold.identifier.str.startswith(s, na=False),'identifier'].map(lambda x: x[len(s):])\n",
+    "# s = 'pmid: '\n",
+    "# gold.loc[gold.identifier.str.startswith(s, na=False),'identifier'] = gold.loc[gold.identifier.str.startswith(s, na=False),'identifier'].map(lambda x: x[len(s):])\n",
+    "\n",
+    "# gold['ResearchMentioned'] = gold.identifier.notnull()\n",
+    "# gold['gold_id'] = gold.index\n",
+    "\n",
+    "# gold.loc[:,code_cols] = gold.loc[:,code_cols].applymap(lambda x: int(x) if type(x) == bool else None)\n",
+    "\n",
+    "# # lowercase pmc and clinical trial\n",
+    "# gold.to_csv('content_analysis_dataset.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Read in coded data\n",
+    "code_cols = ['Aggregated', 'PressRelease', 'ResearchMentioned', 'DescribesAsresearch', 'HasLink', 'JournalMentioned', 'AuthorMentioned', 'InstitutionMentioned', 'StudyDateMentioned']\n",
+    "gold = pd.read_table('data/content_analysis_dataset.tab')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Table 1. Number of stories and mentions across news outlets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tmp = gold[gold.ResearchMentioned == 1].groupby(['outlet'])['URL'].agg(['nunique', 'size'])\n",
+    "tmp['n'] = gold.groupby(['outlet'])['URL'].nunique().astype(int)\n",
+    "tmp['pct'] = tmp['nunique'].divide(tmp['n']).multiply(100).round(0)\n",
+    "tmp['avg'] = tmp['size'].divide(tmp['nunique']).round(1)\n",
+    "tmp = tmp[['n', 'nunique', 'pct', 'size', 'avg']]\n",
+    "tmp.loc['Total'] = tmp.agg({'n': 'sum',\n",
+    "                            'nunique': 'sum',\n",
+    "                            'pct': 'mean',\n",
+    "                            'size': 'sum',\n",
+    "                            'avg': 'mean'})\n",
+    "tmp.loc[:,['n', 'nunique', 'pct', 'size']] = tmp.loc[:,['n', 'nunique', 'pct', 'size']].astype(int)\n",
+    "tmp['n'] = tmp['n'].astype(int)\n",
+    "tmp['nunique'] = tmp['nunique'].astype(int)\n",
+    "tmp['size'] = tmp['size'].astype(int)\n",
+    "tmp['avg'] = tmp['avg'].round(1)\n",
+    "\n",
+    "tmp.columns = ['Num Stories', 'Num Stories w/ Mentions', 'Percent Stories w/ Mentions', 'Num Mentions', 'Average Mentions / Story']\n",
+    "\n",
+    "tmp.to_clipboard()\n",
+    "tmp"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Table 2. How research was mentioned across news outlets"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "mention_cols = ['DescribesAsresearch', 'HasLink', 'JournalMentioned', 'AuthorMentioned', 'InstitutionMentioned', 'StudyDateMentioned']\n",
+    "def pct(x):\n",
+    "    return \"{:.0f}\".format(100*x.sum()/len(x))\n",
+    "\n",
+    "agg = {x: ['sum', pct] for x in mention_cols}\n",
+    "agg['URL'] = 'count'\n",
+    "\n",
+    "df = gold[gold.ResearchMentioned == True].groupby('outlet').agg(agg)\n",
+    "total = gold[gold.ResearchMentioned == True].groupby('ResearchMentioned').agg(agg)\n",
+    "total.index = ['Total']\n",
+    "df = df.append(total)\n",
+    "df = df.astype(int)\n",
+    "# columns = []\n",
+    "# for i, c in enumerate(df.columns): \n",
+    "#     x = c[0]\n",
+    "#     if i % 2 == 1: \n",
+    "#         x = ''\n",
+    "#     if c[1] == 'sum':\n",
+    "#         y = 'Number'\n",
+    "#     elif c[1] == 'pct':\n",
+    "#         y = '%'\n",
+    "#     elif c[1] == 'nunique':\n",
+    "#         y = 'N'\n",
+    "#     columns.append('\\n'.join([x,y]))\n",
+    "# df.columns = columns\n",
+    "df = df.rename(columns={'sum': 'Number', 'pct': '%', 'count': 'Num Mentions'}, level=1)\n",
+    "    \n",
+    "df.to_clipboard()\n",
+    "df\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# id_cols = ['DOI', 'ISBN', 'clinical_trial_id', 'URI', 'pubmed_id', 'pmc_id', 'handle', 'ads_bibcode', 'arxiv_id', 'repec_id', 'SSRN', 'URN']\n",
+    "# alt.loc[:,id_cols] = alt.loc[:,id_cols].applymap(lambda x: x.lower() if type(x) == str else x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "gold_wm = gold[gold.identifier.notnull()]\n",
+    "N = len(gold_wm)\n",
+    "print(\"Gold dataset has {} mentions\".format(N))\n",
+    "\n",
+    "altmetric_urls = set(alt.URL)\n",
+    "could_match = gold_wm[gold_wm.URL.isin(altmetric_urls)]\n",
+    "n = could_match.shape[0]\n",
+    "print(\"Of those, {} ({:.0f}%) could have ID match (URLs in altmetric)\".format(n, n*100/N))\n",
+    "\n",
+    "for i, identifier in enumerate(id_cols):\n",
+    "    df2 = alt[['URL', 'alt_id', identifier]].copy()\n",
+    "    df2.columns = ['URL', 'alt_id', 'matched_alt_id']\n",
+    "    df2.loc[:,'matched_id_type'] = identifier\n",
+    "    merged = gold_wm.merge(df2, left_on=['URL', 'identifier'], right_on=['URL', 'matched_alt_id'])\n",
+    "    if i == 0: \n",
+    "        matched = merged\n",
+    "    else:\n",
+    "        matched = matched.append(merged)\n",
+    "\n",
+    "print(\"Removing {} duplicate matches.\".format(matched.duplicated(subset=['gold_id']).sum()))\n",
+    "\n",
+    "matched.drop_duplicates(subset=['gold_id'], inplace=True) # shouldn't happen, but altmetric has duplicates sometimes\n",
+    "\n",
+    "print(\"Of the {}, {} ({:.0f}%) have a match\".format(N, matched.shape[0], matched.shape[0]*100/N))\n",
+    "\n",
+    "our_urls_mentions = set(gold[gold.ResearchMentioned == True].URL)\n",
+    "no_chance_match = gold_wm[gold_wm.URL.isin(our_urls_mentions.difference(altmetric_urls))]\n",
+    "n = no_chance_match.shape[0]\n",
+    "print(\"Of the {}, {} ({:.0f}%) have no chance of matching (URLs not in altmetric)\".format(N, n, n*100/N))"
+   ]
+  },
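+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A readability note, not used in the analysis: the per-identifier loop above can also be written as a single long-format merge. The sketch below assumes the same `alt`, `gold_wm`, and `id_cols` objects and should recover the same set of matched `gold_id` values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Alternative sketch: reshape the identifier columns to long format and merge once.\n",
+    "# Added for illustration only; the analysis above uses the per-identifier loop.\n",
+    "alt_long = alt.melt(id_vars=['URL', 'alt_id'], value_vars=id_cols,\n",
+    "                    var_name='matched_id_type', value_name='matched_alt_id').dropna(subset=['matched_alt_id'])\n",
+    "matched_alt = gold_wm.merge(alt_long, left_on=['URL', 'identifier'], right_on=['URL', 'matched_alt_id'])\n",
+    "matched_alt = matched_alt.drop_duplicates(subset=['gold_id'])\n",
+    "print(len(matched_alt))"
+   ]
+  },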
{} mentions\".format(N))\n", + "\n", + "altmetric_urls = set(alt.URL)\n", + "could_match = gold_wm[gold_wm.URL.isin(altmetric_urls)]\n", + "n = could_match.shape[0]\n", + "print(\"Of those, {} ({:.0f}%) could have ID match (URLs in altmetric)\".format(n, n*100/N))\n", + "\n", + "for i, identifier in enumerate(id_cols):\n", + " df2 = alt[['URL', 'alt_id', identifier]].copy()\n", + " df2.columns = ['URL', 'alt_id', 'matched_alt_id']\n", + " df2.loc[:,'matched_id_type'] = identifier\n", + " merged = gold_wm.merge(df2, left_on=['URL', 'identifier'], right_on=['URL', 'matched_alt_id'])\n", + " if i == 0: \n", + " matched = merged\n", + " else:\n", + " matched = matched.append(merged)\n", + "\n", + "print(\"Removing {} duplicate matches.\".format(matched.duplicated(subset=['gold_id']).sum()))\n", + "\n", + "matched.drop_duplicates(subset=['gold_id'], inplace=True) # shouldn't happen, but altmetric has duplicates sometimes\n", + "\n", + "print(\"Of the {}, {} ({:.0f}%) have a match\".format(N, matched.shape[0], matched.shape[0]*100/N))\n", + "\n", + "our_urls_mentions = set(gold[gold.ResearchMentioned == True].URL)\n", + "no_chance_match = gold_wm[gold_wm.URL.isin(our_urls_mentions.difference(altmetric_urls))]\n", + "n = no_chance_match.shape[0]\n", + "print(\"Of the {}, {} ({:.0f}%) have no chance of matching (URLs not in altmetric)\".format(N, n, n*100/N))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "alt_errors = alt[(alt.URL.isin(gold[gold.ResearchMentioned == True].URL)) & (~alt.alt_id.isin(matched.alt_id))]\n", + "\n", + "# errs = ['https://www.iflscience.com/health-and-medicine/cannabis-use-early-in-life-linked-to-some-changes-in-heart-and-artery-function', \n", + "# 'https://www.nytimes.com/2021/03/14/health/covid-schools-social-distancing-3-feet.html',\n", + "# 'https://www.nytimes.com/2021/04/01/health/pandemics-plague-history-resilience.html',\n", + "# 'https://www.nytimes.com/2021/04/07/science/particle-physics-muon-fermilab-brookhaven.html']\n", + "\n", + "# news = ['https://consumer.healthday.com/b-3-31-too-few-minorities-in-u-s-health-care-workforce-report-2651245191.html', \n", + "# 'https://www.iflscience.com/environment/bitcoin-mining-will-soon-pump-out-more-carbon-than-czech-republic-new-study-says', \n", + "# 'https://www.iflscience.com/health-and-medicine/male-fertility-how-everyday-chemicals-are-destroying-sperm-counts-in-humans-and-animals', \n", + "# 'https://www.iflscience.com/plants-and-animals/-a-surprising-number-of-sea-monster-sightings-can-be-explained-by-whale-erections', \n", + "# 'https://www.popsci.com/story/health/astrazeneca-vaccine-blood-clots', \n", + "# 'https://www.popsci.com/story/health/how-vaccine-passport-works', \n", + "# 'https://www.popsci.com/story/health/lyme-disease-treatment-for-humans', \n", + "# 'https://www.theguardian.com/society/2021/apr/02/covid-further-rare-blood-clot-cases-found-in-oxford-astrazeneca-recipients', \n", + "# 'https://www.theguardian.com/society/2021/apr/11/is-vaccinating-against-covid-enough-what-we-can-learn-from-other-countries', \n", + "# 'https://www.wired.com/story/blue-carbon-credits-could-help-restore-ecosystems', \n", + "# 'https://www.wired.com/story/how-cargo-ships-could-help-detect-tsunamis', \n", + "# 'https://www.wired.com/story/how-to-kill-a-zombie-fire']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def _f_score_helper(outlet=False):\n", + " if outlet: \n", + " g = 
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional code to create an Excel sheet with the errors found"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# writer = pd.ExcelWriter('errors.xlsx', engine='openpyxl')\n",
+    "# alt_errors.to_excel(writer, 'False Positives')\n",
+    "# gold[(gold.identifier.notnull()) & ~(gold.gold_id.isin(matched.gold_id))].to_excel(writer, 'False Negatives')\n",
+    "# writer.save()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Table 3. Precision, Recall, and Accuracy (F-score) by news outlet"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.DataFrame(gold.groupby('outlet').size())\n",
+    "df.columns = ['N (mentions)']\n",
+    "df['scores'] = df.index.map(lambda x: f_score_mentions(x, False))\n",
+    "df['Precision'] = df.scores.map(lambda x: \"{:.2f}\".format(x[0]))\n",
+    "df['Recall'] = df.scores.map(lambda x: \"{:.2f}\".format(x[1]))\n",
+    "df['F-Score'] = df.scores.map(lambda x: \"{:.2f}\".format(x[2]))\n",
+    "del df['scores']\n",
+    "# df['scores'] = df['F-score'].map(lambda x: \"{:.2f}\".format(x))\n",
+    "df.to_clipboard()\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = gold[gold.gold_id.isin(matched.gold_id)]\n",
+    "df = df.append(gold[gold.gold_id.isin(no_chance_match.gold_id)])\n",
+    "df['match'] = df.gold_id.isin(matched.gold_id)\n",
+    "df['match'] = df.match.astype(int)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Logit Model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import statsmodels.api as sm\n",
+    "import statsmodels.formula.api as smf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = smf.logit(formula=\"match ~ DescribesAsresearch + HasLink + JournalMentioned + AuthorMentioned + InstitutionMentioned + StudyDateMentioned\", data=df)\n",
+    "res = model.fit()\n",
+    "\n",
+    "res.params\n",
+    "odds = np.exp(res.params['HasLink'])\n",
+    "prob = '{:.1f}'.format(odds / (1 + odds))\n",
+    "\n",
+    "print(res.summary())"
+   ]
+  },
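+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "An optional sketch (added for illustration, not part of the original analysis) that converts the fitted model into predicted probabilities of an Altmetric match for two hypothetical coding profiles; the 0/1 values below are illustrative, not observed rows."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Illustrative only: predicted probability of a match for two hypothetical mention profiles\n",
+    "profiles = pd.DataFrame({\n",
+    "    'DescribesAsresearch': [1, 1],\n",
+    "    'HasLink': [1, 0],\n",
+    "    'JournalMentioned': [1, 0],\n",
+    "    'AuthorMentioned': [1, 0],\n",
+    "    'InstitutionMentioned': [1, 0],\n",
+    "    'StudyDateMentioned': [1, 0],\n",
+    "})\n",
+    "print(res.predict(profiles))  # probability that Altmetric captures each hypothetical mention"
+   ]
+  },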
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "params = res.params\n",
+    "conf = res.conf_int()\n",
+    "conf['Odds Ratio'] = params\n",
+    "conf.columns = ['2.5%', '97.5%', 'Odds Ratio']\n",
+    "print(np.exp(conf))\n",
+    "\n",
+    "\n",
+    "np.exp(conf).to_clipboard()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}