diff --git a/src/scripts/dadis_client/vbo_sandbox.ipynb b/src/scripts/dadis_client/vbo_sandbox.ipynb new file mode 100644 index 0000000..f901406 --- /dev/null +++ b/src/scripts/dadis_client/vbo_sandbox.ipynb @@ -0,0 +1,100 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "###\n", + "\n", + "import pandas as pd\n", + "\n", + "outfile = \"dogbreeds_merged.tsv\"\n", + "template_url=\"https://docs.google.com/spreadsheets/d/e/2PACX-1vSuwLXikgq08frK7d8yFSdWTS8P1erx5bS_QiLdHhfKV4ulJlRrqkVaVhC7b3O6Z8ixrvJgoCBy8YLq/pub?gid=1655315858&single=true&output=tsv\"\n", + "term_to_obsolete = \"VBO:0200238\"\n", + "term_to_merge_into = \"VBO:0200239\"\n", + "issue = 123" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"['obsolesence_reason'] not in index\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[31], line 42\u001b[0m\n\u001b[1;32m 40\u001b[0m out \u001b[38;5;241m=\u001b[39m merge(df, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVBO:0200238\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mVBO:0200239\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 41\u001b[0m out\u001b[38;5;241m.\u001b[39mto_csv(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdogbreeds_merged.tsv\u001b[39m\u001b[38;5;124m\"\u001b[39m,sep\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\t\u001b[39;00m\u001b[38;5;124m\"\u001b[39m,index\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n\u001b[0;32m---> 42\u001b[0m \u001b[43mout\u001b[49m\u001b[43m[\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mobsolesence_reason\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mobsolete\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mterm_label\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mvbo_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mreplacement_term\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mreplacement_label\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mGH_issue\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mobsoletion_type\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mcontributors\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msynonym_label_from_merged_term\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msource_for_merged_term\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msynonym_type_most_common_name\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m]\u001b[49m\n", + "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/grainyhead/lib/python3.11/site-packages/pandas/core/frame.py:4096\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 4094\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_iterator(key):\n\u001b[1;32m 4095\u001b[0m key \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(key)\n\u001b[0;32m-> 4096\u001b[0m indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcolumns\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_get_indexer_strict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkey\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcolumns\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m 4098\u001b[0m \u001b[38;5;66;03m# take() does not accept boolean indexers\u001b[39;00m\n\u001b[1;32m 4099\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mgetattr\u001b[39m(indexer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdtype\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;28;01mNone\u001b[39;00m) \u001b[38;5;241m==\u001b[39m \u001b[38;5;28mbool\u001b[39m:\n", + "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/grainyhead/lib/python3.11/site-packages/pandas/core/indexes/base.py:6200\u001b[0m, in \u001b[0;36mIndex._get_indexer_strict\u001b[0;34m(self, key, axis_name)\u001b[0m\n\u001b[1;32m 6197\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 6198\u001b[0m keyarr, indexer, new_indexer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_reindex_non_unique(keyarr)\n\u001b[0;32m-> 6200\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_raise_if_missing\u001b[49m\u001b[43m(\u001b[49m\u001b[43mkeyarr\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mindexer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43maxis_name\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 6202\u001b[0m keyarr \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtake(indexer)\n\u001b[1;32m 6203\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(key, Index):\n\u001b[1;32m 6204\u001b[0m \u001b[38;5;66;03m# GH 42790 - Preserve name from an Index\u001b[39;00m\n", + "File \u001b[0;32m~/.pyenv/versions/3.11.7/envs/grainyhead/lib/python3.11/site-packages/pandas/core/indexes/base.py:6252\u001b[0m, in \u001b[0;36mIndex._raise_if_missing\u001b[0;34m(self, key, indexer, axis_name)\u001b[0m\n\u001b[1;32m 6249\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mNone of [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mkey\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m] are in the [\u001b[39m\u001b[38;5;132;01m{\u001b[39;00maxis_name\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m]\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 6251\u001b[0m not_found \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(ensure_index(key)[missing_mask\u001b[38;5;241m.\u001b[39mnonzero()[\u001b[38;5;241m0\u001b[39m]]\u001b[38;5;241m.\u001b[39munique())\n\u001b[0;32m-> 6252\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mnot_found\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m not in index\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "\u001b[0;31mKeyError\u001b[0m: \"['obsolesence_reason'] not in index\"" + ] + } + ], + "source": [ + "def merge(df, term_to_obsolete, term_to_merge_into, issue=1):\n", + " df_term_to_merge_into = df[df[\"vbo_id\"]==term_to_merge_into].copy()\n", + " df_term_to_obsolete = df[df[\"vbo_id\"]==term_to_obsolete].copy()\n", + " df_term_to_obsolete2 = df_term_to_obsolete.copy()\n", + " \n", + " term_to_merge_into_label = df_term_to_merge_into[\"term_label\"].values[0]\n", + " term_to_obsolete_label = df_term_to_obsolete[\"term_label\"].values[0]\n", + " \n", + " list_of_columns_to_keep = [\"term_label\", \"vbo_id\", \"contributor\"]\n", + " for column in df_term_to_obsolete.columns:\n", + " if column not in list_of_columns_to_keep:\n", + " df_term_to_obsolete[column] = ''\n", + " \n", + " df_term_to_obsolete['term_label'] = \"obsolete \" + df_term_to_obsolete['term_label']\n", + " df_term_to_obsolete['obsoletion_type'] = \"owl:Class\"\n", + " df_term_to_obsolete['obsolete'] = \"true\"\n", + " df_term_to_obsolete['contributors'] = \"\"\n", + " df_term_to_obsolete['replacement_term'] = term_to_merge_into\n", + " df_term_to_obsolete['replacement_label'] = term_to_merge_into_label\n", + " df_term_to_obsolete['obsolescence_reason'] = \"terms merged\"\n", + " df_term_to_obsolete['GH_issue'] = \"https://github.com/monarch-initiative/vertebrate-breed-ontology/issues/{issue}\".format(issue=issue)\n", + " \n", + " # Update duplicated\n", + " df_term_to_obsolete2['vbo_id'] = term_to_merge_into\n", + " df_term_to_obsolete2['term_label'] = ''\n", + " df_term_to_obsolete2['synonym_label_from_merged_term'] = term_to_obsolete_label\n", + " df_term_to_obsolete2['source_for_merged_term'] = term_to_obsolete\n", + " df_term_to_obsolete2['synonym_type_most_common_name'] = ''\n", + " df_term_to_obsolete2['GH_issue'] = \"https://github.com/monarch-initiative/vertebrate-breed-ontology/issues/{issue}\".format(issue=issue) \n", + "\n", + " return pd.concat([df_term_to_obsolete, df_term_to_obsolete2, df_term_to_merge_into])\n", + "\n", + "df=pd.read_csv(template_url,sep=\"\\t\")\n", + "out = merge(df, term_to_obsolete, term_to_merge_into,issue=issue)\n", + "out.to_csv(outfile,sep=\"\\t\",index=False)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}