Skip to content

Commit

Permalink
Create vbo_sandbox.ipynb
Browse files Browse the repository at this point in the history
  • Loading branch information
matentzn committed Apr 5, 2024
1 parent c52df4d commit c8c3642
Showing 1 changed file with 100 additions and 0 deletions.
100 changes: 100 additions & 0 deletions src/scripts/dadis_client/vbo_sandbox.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"###\n",
"\n",
"import pandas as pd\n",
"\n",
"outfile = \"dogbreeds_merged.tsv\"\n",
"template_url=\"https://docs.google.com/spreadsheets/d/e/2PACX-1vSuwLXikgq08frK7d8yFSdWTS8P1erx5bS_QiLdHhfKV4ulJlRrqkVaVhC7b3O6Z8ixrvJgoCBy8YLq/pub?gid=1655315858&single=true&output=tsv\"\n",
"term_to_obsolete = \"VBO:0200238\"\n",
"term_to_merge_into = \"VBO:0200239\"\n",
"issue = 123"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"ename": "KeyError",
"evalue": "\"['obsolesence_reason'] not in index\"",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[31], line 42\u001b[0m\n\u001b[1;32m 40\u001b[0m out \u001b[38;5;241m=\u001b[39m merge(df, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVBO:0200238\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVBO:0200239\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 41\u001b[0m out\u001b[38;5;241m.\u001b[39mto_csv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdogbreeds_merged.tsv\u001b[39m\u001b[38;5;124m\"\u001b[39m,sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m,index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m---> 42\u001b[0m \u001b[43mout\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mobsolesence_reason\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mobsolete\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mterm_label\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mvbo_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mreplacement_term\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mreplacement_label\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mGH_issue\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mobsoletion_type\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcontributors\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msynonym_label_from_merged_term\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msource_for_merged_term\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msynonym_type_most_common_name\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\n",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/grainyhead/lib/python3.11/site-packages/pandas/core/frame.py:4096\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4094\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 4095\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 4096\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 4098\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 4099\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/grainyhead/lib/python3.11/site-packages/pandas/core/indexes/base.py:6200\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m 6197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 6198\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6200\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6202\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m 6203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[1;32m 6204\u001b[0m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n",
"File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/grainyhead/lib/python3.11/site-packages/pandas/core/indexes/base.py:6252\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m 6249\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6251\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[0;32m-> 6252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"\u001b[0;31mKeyError\u001b[0m: \"['obsolesence_reason'] not in index\""
]
}
],
"source": [
"def merge(df, term_to_obsolete, term_to_merge_into, issue=1):\n",
" df_term_to_merge_into = df[df[\"vbo_id\"]==term_to_merge_into].copy()\n",
" df_term_to_obsolete = df[df[\"vbo_id\"]==term_to_obsolete].copy()\n",
" df_term_to_obsolete2 = df_term_to_obsolete.copy()\n",
" \n",
" term_to_merge_into_label = df_term_to_merge_into[\"term_label\"].values[0]\n",
" term_to_obsolete_label = df_term_to_obsolete[\"term_label\"].values[0]\n",
" \n",
" list_of_columns_to_keep = [\"term_label\", \"vbo_id\", \"contributor\"]\n",
" for column in df_term_to_obsolete.columns:\n",
" if column not in list_of_columns_to_keep:\n",
" df_term_to_obsolete[column] = ''\n",
" \n",
" df_term_to_obsolete['term_label'] = \"obsolete \" + df_term_to_obsolete['term_label']\n",
" df_term_to_obsolete['obsoletion_type'] = \"owl:Class\"\n",
" df_term_to_obsolete['obsolete'] = \"true\"\n",
" df_term_to_obsolete['contributors'] = \"\"\n",
" df_term_to_obsolete['replacement_term'] = term_to_merge_into\n",
" df_term_to_obsolete['replacement_label'] = term_to_merge_into_label\n",
" df_term_to_obsolete['obsolescence_reason'] = \"terms merged\"\n",
" df_term_to_obsolete['GH_issue'] = \"https://github.com/monarch-initiative/vertebrate-breed-ontology/issues/{issue}\".format(issue=issue)\n",
" \n",
" # Update duplicated\n",
" df_term_to_obsolete2['vbo_id'] = term_to_merge_into\n",
" df_term_to_obsolete2['term_label'] = ''\n",
" df_term_to_obsolete2['synonym_label_from_merged_term'] = term_to_obsolete_label\n",
" df_term_to_obsolete2['source_for_merged_term'] = term_to_obsolete\n",
" df_term_to_obsolete2['synonym_type_most_common_name'] = ''\n",
" df_term_to_obsolete2['GH_issue'] = \"https://github.com/monarch-initiative/vertebrate-breed-ontology/issues/{issue}\".format(issue=issue) \n",
"\n",
" return pd.concat([df_term_to_obsolete, df_term_to_obsolete2, df_term_to_merge_into])\n",
"\n",
"df=pd.read_csv(template_url,sep=\"\\t\")\n",
"out = merge(df, term_to_obsolete, term_to_merge_into,issue=issue)\n",
"out.to_csv(outfile,sep=\"\\t\",index=False)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

0 comments on commit c8c3642

Please sign in to comment.