diff --git a/notebooks/NOAA/03_1-96_create_taxalist.ipynb b/notebooks/NOAA/03_1-96_create_taxalist.ipynb index 17b796b2e..f55865dcd 100644 --- a/notebooks/NOAA/03_1-96_create_taxalist.ipynb +++ b/notebooks/NOAA/03_1-96_create_taxalist.ipynb @@ -79,14 +79,16 @@ { "cell_type": "markdown", "id": "1763eb3f", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ "## Create NOAA taxa list" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 28, "id": "b55963d8", "metadata": {}, "outputs": [ @@ -146,7 +148,7 @@ " 2\n", " NOAA/DSDP_core_data/61/462/b_forams.csv\n", " taxa\n", - " benthic_foraminfera\n", + " benthic_forams\n", " 61\n", " 462\n", " \n", @@ -154,7 +156,7 @@ " 3\n", " NOAA/DSDP_core_data/61/462/p_forams.csv\n", " taxa\n", - " planktic_foraminfera\n", + " planktic_forams\n", " 61\n", " 462\n", " \n", @@ -171,12 +173,12 @@ "" ], "text/plain": [ - " path type taxon_group \\\n", - "0 NOAA/DSDP_core_data/61/462/radiolar.csv taxa radiolarians \n", - "1 NOAA/DSDP_core_data/61/462/ageprof.csv age NaN \n", - "2 NOAA/DSDP_core_data/61/462/b_forams.csv taxa benthic_foraminfera \n", - "3 NOAA/DSDP_core_data/61/462/p_forams.csv taxa planktic_foraminfera \n", - "4 NOAA/DSDP_core_data/61/462/hr_desc.csv hard_rock NaN \n", + " path type taxon_group \\\n", + "0 NOAA/DSDP_core_data/61/462/radiolar.csv taxa radiolarians \n", + "1 NOAA/DSDP_core_data/61/462/ageprof.csv age NaN \n", + "2 NOAA/DSDP_core_data/61/462/b_forams.csv taxa benthic_forams \n", + "3 NOAA/DSDP_core_data/61/462/p_forams.csv taxa planktic_forams \n", + "4 NOAA/DSDP_core_data/61/462/hr_desc.csv hard_rock NaN \n", "\n", " expedition site \n", "0 61 462 \n", @@ -186,7 +188,7 @@ "4 61 462 " ] }, - "execution_count": 3, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ -206,7 +208,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 29, "id": "5fafbba0", "metadata": {}, "outputs": [ @@ -216,7 +218,7 @@ "9933" ] }, - "execution_count": 4, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -239,7 +241,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 30, "id": "09358e98", "metadata": {}, "outputs": [ @@ -247,13 +249,13 @@ "data": { "text/plain": [ "[nan,\n", - " 'Theocalyptra bicornis|radiolarians',\n", - " 'Zygodiscus splendens|nannofossils',\n", - " 'Glandulina sp.|benthic_foraminfera',\n", - " 'Coscinodiscus radiatus var.|diatoms']" + " 'Pseudoemiliania lacunosa (oval)|nannofossils',\n", + " 'Theocorys antiqua|radiolarians',\n", + " 'Cassigerinella eoceanica|planktic_forams',\n", + " 'Diploneis weissflogii|diatoms']" ] }, - "execution_count": 5, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -264,7 +266,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 31, "id": "727db0d2", "metadata": {}, "outputs": [ @@ -274,7 +276,7 @@ "9932" ] }, - "execution_count": 6, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -307,7 +309,42 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 32, + "id": "c78cb092-b847-4f6f-9e69-43ee6d9bb312", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[{'verbatim_name': 'Pseudoemiliania lacunosa (oval)',\n", + " 'taxon_group': 'nannofossils',\n", + " 'genus name': 'Pseudoemiliania',\n", + " 'simplified_name': 'Pseudoemiliania lacunosa',\n", + " 'species name': 'lacunosa'},\n", + " {'verbatim_name': 'Theocorys antiqua',\n", + " 'taxon_group': 'radiolarians',\n", + " 'genus name': 'Theocorys',\n", + " 'simplified_name': 'Theocorys antiqua',\n", + " 'species name': 'antiqua'},\n", + " {'verbatim_name': 'Cassigerinella eoceanica',\n", + " 'taxon_group': 'planktic_forams',\n", + " 'genus name': 'Cassigerinella',\n", + " 'simplified_name': 'Cassigerinella eoceanica',\n", + " 'species name': 'eoceanica'}]" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "taxa_list[0:3]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, "id": "735288ce", "metadata": {}, "outputs": [ @@ -349,7 +386,7 @@ " \n", " \n", " \n", - " 6496\n", + " 6695\n", " Abas wittii\n", " diatoms\n", " Abas\n", @@ -358,36 +395,36 @@ " NaN\n", " \n", " \n", - " 8183\n", + " 7060\n", " Abathomphalus intermedius\n", - " planktic_foraminfera\n", + " planktic_forams\n", " Abathomphalus\n", " Abathomphalus intermedius\n", " intermedius\n", " NaN\n", " \n", " \n", - " 6550\n", + " 7918\n", " Abathomphalus mayaroensis\n", - " planktic_foraminfera\n", + " planktic_forams\n", " Abathomphalus\n", " Abathomphalus mayaroensis\n", " mayaroensis\n", " NaN\n", " \n", " \n", - " 8864\n", + " 9844\n", " Abies sp.\n", - " pollen\n", + " Pollen and Spores\n", " Abies\n", " Abies sp.\n", " sp.\n", " NaN\n", " \n", " \n", - " 6898\n", + " 822\n", " Abutilon sp. (q)\n", - " pollen\n", + " Pollen and Spores\n", " Abutilon\n", " Abutilon sp.\n", " sp.\n", @@ -398,22 +435,22 @@ "" ], "text/plain": [ - " verbatim_name taxon_group genus name \\\n", - "6496 Abas wittii diatoms Abas \n", - "8183 Abathomphalus intermedius planktic_foraminfera Abathomphalus \n", - "6550 Abathomphalus mayaroensis planktic_foraminfera Abathomphalus \n", - "8864 Abies sp. pollen Abies \n", - "6898 Abutilon sp. (q) pollen Abutilon \n", + " verbatim_name taxon_group genus name \\\n", + "6695 Abas wittii diatoms Abas \n", + "7060 Abathomphalus intermedius planktic_forams Abathomphalus \n", + "7918 Abathomphalus mayaroensis planktic_forams Abathomphalus \n", + "9844 Abies sp. Pollen and Spores Abies \n", + "822 Abutilon sp. (q) Pollen and Spores Abutilon \n", "\n", " simplified_name species name subspecies name \n", - "6496 Abas wittii wittii NaN \n", - "8183 Abathomphalus intermedius intermedius NaN \n", - "6550 Abathomphalus mayaroensis mayaroensis NaN \n", - "8864 Abies sp. sp. NaN \n", - "6898 Abutilon sp. sp. NaN " + "6695 Abas wittii wittii NaN \n", + "7060 Abathomphalus intermedius intermedius NaN \n", + "7918 Abathomphalus mayaroensis mayaroensis NaN \n", + "9844 Abies sp. sp. NaN \n", + "822 Abutilon sp. sp. NaN " ] }, - "execution_count": 7, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -423,10 +460,23 @@ "log_df(noaa_taxa_df)" ] }, + { + "cell_type": "code", + "execution_count": 34, + "id": "0d99035e-8806-4fd3-982d-f8dfbbc395e8", + "metadata": {}, + "outputs": [], + "source": [ + "noaa_taxa_df.to_csv(all_taxa_path, index=False)" + ] + }, { "cell_type": "markdown", "id": "687918ba", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ "## compare and replace taxon groups \n", "\n", @@ -435,7 +485,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "id": "d24ea9d7", "metadata": {}, "outputs": [ @@ -591,7 +641,7 @@ "4 benthic_forams " ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -604,7 +654,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "id": "3fc24548", "metadata": {}, "outputs": [ @@ -625,7 +675,7 @@ " 'silicoflagellates']" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -638,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "id": "40b9838f", "metadata": {}, "outputs": [ @@ -658,7 +708,7 @@ " 'silicoflagellates']" ] }, - "execution_count": 10, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -671,7 +721,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "id": "143b5574", "metadata": {}, "outputs": [ @@ -681,7 +731,7 @@ "{'benthic_foraminfera', 'phytoliths', 'planktic_foraminfera', 'pollen'}" ] }, - "execution_count": 11, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -692,7 +742,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "id": "19e4fd48", "metadata": {}, "outputs": [ @@ -727,7 +777,7 @@ " \n", " \n", " \n", - " 6496\n", + " 6732\n", " Abas wittii\n", " diatoms\n", " Abas\n", @@ -736,7 +786,7 @@ " NaN\n", " \n", " \n", - " 8183\n", + " 5941\n", " Abathomphalus intermedius\n", " planktic_forams\n", " Abathomphalus\n", @@ -745,7 +795,7 @@ " NaN\n", " \n", " \n", - " 6550\n", + " 7525\n", " Abathomphalus mayaroensis\n", " planktic_forams\n", " Abathomphalus\n", @@ -754,7 +804,7 @@ " NaN\n", " \n", " \n", - " 8864\n", + " 564\n", " Abies sp.\n", " pollen\n", " Abies\n", @@ -763,7 +813,7 @@ " NaN\n", " \n", " \n", - " 6898\n", + " 697\n", " Abutilon sp. (q)\n", " pollen\n", " Abutilon\n", @@ -777,21 +827,21 @@ ], "text/plain": [ " verbatim_name taxon_group genus name \\\n", - "6496 Abas wittii diatoms Abas \n", - "8183 Abathomphalus intermedius planktic_forams Abathomphalus \n", - "6550 Abathomphalus mayaroensis planktic_forams Abathomphalus \n", - "8864 Abies sp. pollen Abies \n", - "6898 Abutilon sp. (q) pollen Abutilon \n", + "6732 Abas wittii diatoms Abas \n", + "5941 Abathomphalus intermedius planktic_forams Abathomphalus \n", + "7525 Abathomphalus mayaroensis planktic_forams Abathomphalus \n", + "564 Abies sp. pollen Abies \n", + "697 Abutilon sp. (q) pollen Abutilon \n", "\n", " simplified_name species name subspecies name \n", - "6496 Abas wittii wittii NaN \n", - "8183 Abathomphalus intermedius intermedius NaN \n", - "6550 Abathomphalus mayaroensis mayaroensis NaN \n", - "8864 Abies sp. sp. NaN \n", - "6898 Abutilon sp. sp. NaN " + "6732 Abas wittii wittii NaN \n", + "5941 Abathomphalus intermedius intermedius NaN \n", + "7525 Abathomphalus mayaroensis mayaroensis NaN \n", + "564 Abies sp. sp. NaN \n", + "697 Abutilon sp. sp. NaN " ] }, - "execution_count": 12, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -804,7 +854,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "id": "218ca553", "metadata": {}, "outputs": [], @@ -830,7 +880,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 35, "id": "40bc04a0", "metadata": {}, "outputs": [ @@ -901,7 +951,7 @@ " \n", " 3\n", " Abies sp.\n", - " pollen\n", + " Pollen and Spores\n", " Abies\n", " Abies sp.\n", " sp.\n", @@ -910,7 +960,7 @@ " \n", " 4\n", " Abutilon sp. (q)\n", - " pollen\n", + " Pollen and Spores\n", " Abutilon\n", " Abutilon sp.\n", " sp.\n", @@ -921,12 +971,12 @@ "" ], "text/plain": [ - " verbatim_name taxon_group genus name \\\n", - "0 Abas wittii diatoms Abas \n", - "1 Abathomphalus intermedius planktic_forams Abathomphalus \n", - "2 Abathomphalus mayaroensis planktic_forams Abathomphalus \n", - "3 Abies sp. pollen Abies \n", - "4 Abutilon sp. (q) pollen Abutilon \n", + " verbatim_name taxon_group genus name \\\n", + "0 Abas wittii diatoms Abas \n", + "1 Abathomphalus intermedius planktic_forams Abathomphalus \n", + "2 Abathomphalus mayaroensis planktic_forams Abathomphalus \n", + "3 Abies sp. Pollen and Spores Abies \n", + "4 Abutilon sp. (q) Pollen and Spores Abutilon \n", "\n", " simplified_name species name subspecies name \n", "0 Abas wittii wittii NaN \n", @@ -936,7 +986,7 @@ "4 Abutilon sp. sp. NaN " ] }, - "execution_count": 15, + "execution_count": 35, "metadata": {}, "output_type": "execute_result" } @@ -958,7 +1008,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 17, "id": "d2a9e14a", "metadata": {}, "outputs": [ @@ -1114,7 +1164,7 @@ "4 benthic_forams " ] }, - "execution_count": 16, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1135,7 +1185,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 18, "id": "6a2b5db3", "metadata": {}, "outputs": [], @@ -1145,7 +1195,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 19, "id": "e44db772", "metadata": {}, "outputs": [], @@ -1157,7 +1207,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 20, "id": "acac5b88", "metadata": {}, "outputs": [ @@ -1249,7 +1299,7 @@ "201 Bolivina cf. crenulata " ] }, - "execution_count": 19, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -1260,7 +1310,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 21, "id": "06864474", "metadata": {}, "outputs": [ @@ -1335,7 +1385,7 @@ "4 benthic_forams Ostracoda indet." ] }, - "execution_count": 20, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -1355,7 +1405,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 22, "id": "c8f9cf39", "metadata": {}, "outputs": [ @@ -1467,7 +1517,7 @@ "4 Abutilon sp. sp. NaN left_only " ] }, - "execution_count": 21, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -1484,7 +1534,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 23, "id": "820dbdc2", "metadata": {}, "outputs": [ @@ -1596,7 +1646,7 @@ "4 Abutilon sp. sp. NaN left_only " ] }, - "execution_count": 22, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1609,7 +1659,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 24, "id": "1a573f5f", "metadata": {}, "outputs": [], @@ -1617,306 +1667,6 @@ "merged_df.to_csv(merged_path, index=False)\n" ] }, - { - "cell_type": "markdown", - "id": "28f17cf4", - "metadata": {}, - "source": [ - "### compare merge methods\n", - "do merge on simplified_name without taxon groups" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "13728bf2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(10114, 8)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
verbatim_nametaxon_group_xgenus namesimplified_namespecies namesubspecies nametaxon_group_y_merge_approved
0Abas wittiidiatomsAbasAbas wittiiwittiiNaNNaNleft_only
1Abathomphalus intermediusplanktic_foramsAbathomphalusAbathomphalus intermediusintermediusNaNNaNleft_only
2Abathomphalus mayaroensisplanktic_foramsAbathomphalusAbathomphalus mayaroensismayaroensisNaNplanktic_foramsboth
3Abies sp.pollenAbiesAbies sp.sp.NaNNaNleft_only
4Abutilon sp. (q)pollenAbutilonAbutilon sp.sp.NaNNaNleft_only
\n", - "
" - ], - "text/plain": [ - " verbatim_name taxon_group_x genus name \\\n", - "0 Abas wittii diatoms Abas \n", - "1 Abathomphalus intermedius planktic_forams Abathomphalus \n", - "2 Abathomphalus mayaroensis planktic_forams Abathomphalus \n", - "3 Abies sp. pollen Abies \n", - "4 Abutilon sp. (q) pollen Abutilon \n", - "\n", - " simplified_name species name subspecies name taxon_group_y \\\n", - "0 Abas wittii wittii NaN NaN \n", - "1 Abathomphalus intermedius intermedius NaN NaN \n", - "2 Abathomphalus mayaroensis mayaroensis NaN planktic_forams \n", - "3 Abies sp. sp. NaN NaN \n", - "4 Abutilon sp. sp. NaN NaN \n", - "\n", - " _merge_approved \n", - "0 left_only \n", - "1 left_only \n", - "2 both \n", - "3 left_only \n", - "4 left_only " - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 10114 \n", - "merged2_df = pd.merge(noaa_taxa_df, LIMS_taxa_df, \n", - " on=['simplified_name'], \n", - " how='left',\n", - " indicator='_merge_approved')\n", - "\n", - "log_df(merged2_df)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "d20ea96a", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(9937, 8)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
verbatim_nametaxon_group_xgenus namesimplified_namespecies namesubspecies nametaxon_group_y_merge_approved
0Abas wittiidiatomsAbasAbas wittiiwittiiNaNNaNleft_only
1Abathomphalus intermediusplanktic_foramsAbathomphalusAbathomphalus intermediusintermediusNaNNaNleft_only
2Abathomphalus mayaroensisplanktic_foramsAbathomphalusAbathomphalus mayaroensismayaroensisNaNplanktic_foramsboth
3Abies sp.pollenAbiesAbies sp.sp.NaNNaNleft_only
4Abutilon sp. (q)pollenAbutilonAbutilon sp.sp.NaNNaNleft_only
\n", - "
" - ], - "text/plain": [ - " verbatim_name taxon_group_x genus name \\\n", - "0 Abas wittii diatoms Abas \n", - "1 Abathomphalus intermedius planktic_forams Abathomphalus \n", - "2 Abathomphalus mayaroensis planktic_forams Abathomphalus \n", - "3 Abies sp. pollen Abies \n", - "4 Abutilon sp. (q) pollen Abutilon \n", - "\n", - " simplified_name species name subspecies name taxon_group_y \\\n", - "0 Abas wittii wittii NaN NaN \n", - "1 Abathomphalus intermedius intermedius NaN NaN \n", - "2 Abathomphalus mayaroensis mayaroensis NaN planktic_forams \n", - "3 Abies sp. sp. NaN NaN \n", - "4 Abutilon sp. sp. NaN NaN \n", - "\n", - " _merge_approved \n", - "0 left_only \n", - "1 left_only \n", - "2 both \n", - "3 left_only \n", - "4 left_only " - ] - }, - "execution_count": 25, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 9937\n", - "merged2_df = merged2_df.drop_duplicates()\n", - "log_df(merged2_df)" - ] - }, - { - "cell_type": "markdown", - "id": "e86c04a9", - "metadata": {}, - "source": [ - "the reason for the count difference is because the LIMS taxa list sometimes puts a taxa in two groups\n", - "\n", - "NOAA: Selenopemphix nephroides - dinoflagellates \n", - "LIMS: Selenopemphix nephroides - dinoflagellates, palynology" - ] - }, { "cell_type": "markdown", "id": "469c0622", @@ -1935,7 +1685,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 25, "id": "08e1a3e5", "metadata": {}, "outputs": [ @@ -2047,7 +1797,7 @@ "4 Abutilon sp. sp. NaN left_only " ] }, - "execution_count": 26, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -2059,7 +1809,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 26, "id": "431dc1fc", "metadata": {}, "outputs": [ @@ -2171,7 +1921,7 @@ "5 Abyssamina incisa incisa NaN left_only " ] }, - "execution_count": 27, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -2193,7 +1943,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 27, "id": "c952bf43", "metadata": {}, "outputs": [], diff --git a/notebooks/NOAA/24_101-210_create_taxalist.ipynb b/notebooks/NOAA/24_101-210_create_taxalist.ipynb index 05e67744c..3739a23a1 100644 --- a/notebooks/NOAA/24_101-210_create_taxalist.ipynb +++ b/notebooks/NOAA/24_101-210_create_taxalist.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 199, + "execution_count": 1, "id": "ddb3c36b", "metadata": {}, "outputs": [], @@ -24,41 +24,54 @@ "import os\n", "import requests\n", "import re\n", + "import shutil\n", + "\n", + "sys.path.append('../../')\n", + "\n", + "from config import CLEAN_DATA_DIR, OUTPUT_DIR, RAW_DATA_DIR\n", + "\n", "\n", - "sys.path.append('../scripts/')\n", - "sys.path.append('../')\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import db \n", - "import normalize_taxa as nt\n", + "import scripts.normalize_taxa as nt\n", "from scripts.shared_utils import (\n", " log_df\n", - ")" + ")\n", + "import scripts.pbdb as pbdb \n" ] }, { "cell_type": "code", - "execution_count": 200, + "execution_count": 2, "id": "f214ae66", "metadata": {}, "outputs": [], "source": [ - "base_directory = 'cleaned_data'\n", - "date='2021-07-28'\n", - "metadata_path = os.path.join(base_directory, 'metadata', 'NOAA', 'noaa_janus_iodp_files.csv')\n", - "approved_taxa_path = os.path.join(base_directory, 'taxa', 'LIMS', f'taxa_list_{date}.csv')\n", - "noaa_1_96_taxa_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_crosswalk_{date}.csv')\n", - "noaa_1_96_genus_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'genus_{date}.csv')\n", + "# base_directory = 'cleaned_data'\n", + "date='2022-11-15'\n", + "\n", + "lims_taxa_path = OUTPUT_DIR/'taxa'/'LIMS'/f'taxa_list_{date}.csv'\n", + "lims_crosswalk_path = OUTPUT_DIR/'taxa'/'LIMS'/f'taxa_crosswalk_{date}.csv'\n", + "lims_genus_path = OUTPUT_DIR/'taxa'/'LIMS'/f'genera_pbdb_{date}.csv'\n", + "\n", + "\n", + "date='2022-11-18'\n", + "\n", + "metadata_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_janus_iodp_files.csv'\n", + "noaa_101_210_taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_101_210_{date}.csv'\n", + "unapproved_taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_unapproved_101_210_{date}.csv'\n", + "species_path = OUTPUT_DIR/'taxa'/'NOAA'/f'species_101_210_{date}.csv'\n", + "genus_path = OUTPUT_DIR/'taxa'/'NOAA'/f'genus_101_210_{date}.csv'\n", + "higher_path = OUTPUT_DIR/'taxa'/'NOAA'/f'higher_taxa_101_210_{date}.csv'\n", + "\n", "\n", - "date='2021-08-03'\n", - "crosswalk_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_crosswalk_{date}.csv')\n", - "merged_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_merged_{date}.csv')\n", - "merged2_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_merged2_{date}.csv')\n", + "metadata_1_96_path = OUTPUT_DIR/'metadata'/'NOAA'/'noaa_dsdp_files.csv'\n", + "noaa_1_96_taxa_path = OUTPUT_DIR/'taxa'/'draft'/'NOAA'/f'taxa_1_96_{date}.csv'\n", "\n", - "taxa_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_list_{date}.csv')\n", - "genus_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA', f'genus_101_210_{date}.csv')\n", - "taxa_pbdb_path = os.path.join(base_directory, 'taxa', 'draft', 'NOAA',f'taxa_101_210_list_pbdb_{date}.csv')\n" + "PI_noaa_1_96_taxa_path = RAW_DATA_DIR/'PI_processed_files'/'NOAA_taxa_lists_taxa_list_2022-11-15.csv'\n", + "\n" ] }, { @@ -71,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 202, + "execution_count": 3, "id": "f5197e6e", "metadata": {}, "outputs": [ @@ -112,66 +125,66 @@ " \n", " \n", " \n", - " 436\n", - " cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...\n", + " 0\n", + " NOAA/JanusIODP_paleo_agemodel/paleontology/ran...\n", " taxa\n", - " 135\n", - " 835\n", - " nannofossils\n", + " 101\n", + " 626\n", + " benthic_forams\n", " \n", " \n", - " 437\n", - " cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...\n", + " 1\n", + " NOAA/JanusIODP_paleo_agemodel/paleontology/ran...\n", " taxa\n", - " 135\n", - " 834\n", - " nannofossils\n", + " 101\n", + " 626\n", + " benthic_forams\n", " \n", " \n", - " 438\n", - " cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...\n", + " 2\n", + " NOAA/JanusIODP_paleo_agemodel/paleontology/ran...\n", " taxa\n", - " 135\n", - " 834\n", + " 101\n", + " 626\n", " nannofossils\n", " \n", " \n", - " 439\n", - " cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...\n", + " 3\n", + " NOAA/JanusIODP_paleo_agemodel/paleontology/ran...\n", " taxa\n", - " 135\n", - " 841\n", + " 101\n", + " 626\n", " nannofossils\n", " \n", " \n", - " 440\n", - " cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel...\n", + " 4\n", + " NOAA/JanusIODP_paleo_agemodel/paleontology/ran...\n", " taxa\n", - " 135\n", - " 841\n", - " benthic_foraminfera\n", + " 101\n", + " 626\n", + " planktic_forams\n", " \n", " \n", "\n", "" ], "text/plain": [ - " path type expedition \\\n", - "436 cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel... taxa 135 \n", - "437 cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel... taxa 135 \n", - "438 cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel... taxa 135 \n", - "439 cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel... taxa 135 \n", - "440 cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel... taxa 135 \n", - "\n", - " site taxon_group \n", - "436 835 nannofossils \n", - "437 834 nannofossils \n", - "438 834 nannofossils \n", - "439 841 nannofossils \n", - "440 841 benthic_foraminfera " + " path type expedition site \\\n", + "0 NOAA/JanusIODP_paleo_agemodel/paleontology/ran... taxa 101 626 \n", + "1 NOAA/JanusIODP_paleo_agemodel/paleontology/ran... taxa 101 626 \n", + "2 NOAA/JanusIODP_paleo_agemodel/paleontology/ran... taxa 101 626 \n", + "3 NOAA/JanusIODP_paleo_agemodel/paleontology/ran... taxa 101 626 \n", + "4 NOAA/JanusIODP_paleo_agemodel/paleontology/ran... taxa 101 626 \n", + "\n", + " taxon_group \n", + "0 benthic_forams \n", + "1 benthic_forams \n", + "2 nannofossils \n", + "3 nannofossils \n", + "4 planktic_forams " ] }, - "execution_count": 202, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -184,210 +197,15 @@ }, { "cell_type": "code", - "execution_count": 203, - "id": "e26d41fd", - "metadata": {}, - "outputs": [], - "source": [ - "common_fields = {\n", - " 'Data',\n", - " 'Age From (oldest)',\n", - " 'Age To (youngest)',\n", - " 'Zone From (bottom)',\n", - " 'Zone To (top)',\n", - " 'Leg',\n", - " 'Site',\n", - " 'H',\n", - " 'Cor',\n", - " 'T',\n", - " 'Sc',\n", - " 'Top(cm)',\n", - " 'Depth (mbsf)',\n", - " 'Scientist',\n", - " 'Fossil Group',\n", - " 'Comment', \n", - " 'Group Abundance',\n", - " 'Group Preservation',\n", - "}\n" - ] - }, - { - "cell_type": "code", - "execution_count": 204, - "id": "a7c6d8e5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'Inoceramus (prisms)'}" - ] - }, - "execution_count": 204, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "path = 'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/104/643/HOLE_A/Miscellaneous.csv'\n", - "df = pd.read_csv(path)\n", - "df.dropna(axis=0, inplace=True, how='all')\n", - "file_taxa = set([col.strip() for col in df.columns]) - common_fields\n", - "file_taxa\n", - "\n", - "\n", - "# taxa = [col.strip() + '|' + row['taxon_group'] for col in file_taxa if col is not None]\n", - "# taxa[0:5]" - ] - }, - { - "cell_type": "markdown", - "id": "892cc5db", - "metadata": {}, - "source": [ - "read all the taxa files to get unique taxa names" - ] - }, - { - "cell_type": "code", - "execution_count": 219, - "id": "26a02e45", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "13285" - ] - }, - "execution_count": 219, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 13285\n", - "taxa = set()\n", - "\n", - "skip_files = [\n", - " # needs fixing - Z, X; manually remove quotes\n", - " 'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/175/1077/HOLE_A/Diatoms.csv', \n", - " # needs fixing - Reticulofenestra Z \n", - " 'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_D/Nannofossils.csv',\n", - " # needs fixing - Form A, Form B \n", - " 'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1120/HOLE_B/Radiolarians.csv',\n", - " # needs fixing - Form A \n", - " 'cleaned_data/NOAA_csv/JanusIODP_paleo_agemodel/paleontology/range_tables/181/1119/HOLE_B/Radiolarians.csv'\n", - "]\n", - "\n", - "for index, row in metadata.iterrows():\n", - " path = row['path']\n", - " if path in skip_files:\n", - " continue\n", - " \n", - " df = pd.read_csv(path)\n", - " df.dropna(axis='index', inplace=True, how='all')\n", - " \n", - " file_taxa = set([col.strip() for col in df.columns]) - common_fields\n", - " temp_taxa = [taxon.strip() + '|' + row['taxon_group'] for taxon in file_taxa if isinstance(taxon, str)] \n", - " taxa.update(temp_taxa)\n", - " \n", - "len(taxa)" - ] - }, - { - "cell_type": "code", - "execution_count": 220, - "id": "c45ac7c7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Odontochitina spp.|dinoflagellates/acritarchs/prasinophytes',\n", - " 'Enneadocysta harrisii|nannofossils',\n", - " 'Tenuitella iota|planktic_foraminfera',\n", - " 'Stereisporites taxa|pollen',\n", - " 'Globorotalia inflata (4 chambered)|nannofossils']" - ] - }, - "execution_count": 220, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "list(taxa)[20:25]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6e1f5ce2", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 224, - "id": "ef6d4217", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "13285" - ] - }, - "execution_count": 224, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 13285\n", - "taxa_list = []\n", - "\n", - "for taxon in taxa:\n", - " if not pd.isna(taxon):\n", - " taxon_name, taxon_group = taxon.split('|')\n", - " simplified_name = re.sub('\\(.*?\\)$', '', taxon_name).strip()\n", - " taxon_name_parts = simplified_name.split(' ')\n", - "\n", - " data = {'verbatim_name': taxon_name, \n", - " 'taxon_group': taxon_group, \n", - " 'genus name': taxon_name_parts[0],\n", - " 'simplified_name': simplified_name}\n", - " if len(taxon_name_parts) > 1:\n", - " data['species name'] = taxon_name_parts[1]\n", - " if len(taxon_name_parts) == 3:\n", - " data['subspecies name'] = taxon_name_parts[2]\n", - "\n", - " taxa_list.append(data)\n", - " \n", - "len(taxa_list)" - ] - }, - { - "cell_type": "markdown", - "id": "005e05ac", - "metadata": {}, - "source": [ - "create taxa list csv" - ] - }, - { - "cell_type": "code", - "execution_count": 225, - "id": "441130e0", + "execution_count": 4, + "id": "a27fd00b-91cf-4819-aacb-8cc4a1669ab0", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(13285, 6)\n" + "(13066, 10)\n" ] }, { @@ -411,241 +229,153 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", + " verbatim_name\n", + " name\n", + " genus modifier\n", " genus name\n", - " simplified_name\n", + " species modifier\n", " species name\n", + " subspecies modifier\n", " subspecies name\n", + " non-taxa descriptor\n", " \n", " \n", " \n", " \n", - " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", - " NaN\n", + " 9761\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Labyrinthodinium sp. 1\n", + " \n", + " ?\n", + " Labyrinthodinium\n", + " <NA>\n", + " sp.\n", + " <NA>\n", + " 1\n", + " <NA>\n", " \n", " \n", - " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_foraminfera\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", - " NaN\n", + " 5657\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Maduradinium sp.\n", + " \n", + " ?\n", + " Maduradinium\n", + " <NA>\n", + " sp.\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", - " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", - " NaN\n", + " 2305\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Pyxidiella sp. 1\n", + " \n", + " ?\n", + " Pyxidiella\n", + " <NA>\n", + " sp.\n", + " <NA>\n", + " 1\n", + " <NA>\n", " \n", " \n", - " 3\n", - " Globanomalina planocompressa\n", - " planktic_foraminfera\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", - " NaN\n", + " 1302\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella ivoirensis\n", + " \n", + " <NA>\n", + " Aandalusiella\n", + " <NA>\n", + " ivoirensis\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", - " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", - " NaN\n", + " 8448\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium cardioforme\n", + " \n", + " <NA>\n", + " Abratopdinium\n", + " <NA>\n", + " cardioforme\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_foraminfera \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_foraminfera \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " + " taxon_group verbatim_name \\\n", + "9761 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 \n", + "5657 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. \n", + "2305 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 \n", + "1302 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis \n", + "8448 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme \n", + "\n", + " name genus modifier genus name species modifier species name \\\n", + "9761 ? Labyrinthodinium sp. \n", + "5657 ? Maduradinium sp. \n", + "2305 ? Pyxidiella sp. \n", + "1302 Aandalusiella ivoirensis \n", + "8448 Abratopdinium cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor \n", + "9761 1 \n", + "5657 \n", + "2305 1 \n", + "1302 \n", + "8448 " ] }, - "execution_count": 225, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "taxa_df = pd.DataFrame(taxa_list)\n", - "log_df(taxa_df)" + "taxa_df = nt.create_noaa_2_taxa_crosswalk_df(metadata, CLEAN_DATA_DIR)\n", + "log_df(taxa_df)\n", + "# 13066" ] }, { "cell_type": "code", - "execution_count": 226, + "execution_count": 5, "id": "168b0b36", "metadata": {}, "outputs": [], "source": [ - "taxa_df.to_csv(crosswalk_path, index=False)" + "taxa_df.to_csv(noaa_101_210_taxa_path, index=False)" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "631f8561", - "metadata": {}, - "outputs": [], - "source": [] - }, { "cell_type": "markdown", - "id": "4a2fbee7", - "metadata": {}, - "source": [ - "## compare and replace taxon groups " - ] - }, - { - "cell_type": "code", - "execution_count": 227, - "id": "b08e32df", - "metadata": {}, - "outputs": [], - "source": [ - "noaa_taxa_df = pd.read_csv(crosswalk_path)\n", - "approved_taxa_df = pd.read_csv(approved_taxa_path)" - ] - }, - { - "cell_type": "code", - "execution_count": 228, - "id": "ec7b5427", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['benthic_forams',\n", - " 'bolboformids',\n", - " 'chrysophyte_cysts',\n", - " 'diatoms',\n", - " 'dinoflagellates',\n", - " 'ebridians',\n", - " 'nannofossils',\n", - " 'ostracods',\n", - " 'palynology',\n", - " 'planktic_forams',\n", - " 'radiolarians',\n", - " 'silicoflagellates']" - ] - }, - "execution_count": 228, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "approved_groups = list(approved_taxa_df['taxon_group'].unique())\n", - "approved_groups.sort()\n", - "approved_groups" - ] - }, - { - "cell_type": "code", - "execution_count": 229, - "id": "4cd76d19", + "id": "7f09599a", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['benthic_foraminfera',\n", - " 'bolboformids',\n", - " 'diatoms',\n", - " 'dinoflagellates/acritarchs/prasinophytes',\n", - " 'macrofossils',\n", - " 'miscellaneous',\n", - " 'nannofossils',\n", - " 'ostracods',\n", - " 'planktic_foraminfera',\n", - " 'pollen',\n", - " 'pteropods',\n", - " 'radiolarians',\n", - " 'silicoflagellates/ebridians/actiniscidians',\n", - " 'sponge_spicules',\n", - " 'trace_fossils']" - ] - }, - "execution_count": 229, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "noaa_groups = list(noaa_taxa_df['taxon_group'].unique())\n", - "noaa_groups.sort()\n", - "noaa_groups" + "## create noaa 1 taxa" ] }, { "cell_type": "code", - "execution_count": 230, - "id": "adca85f1", + "execution_count": 5, + "id": "0510c3f1-0faa-43cc-b183-d0569e0f9895", "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "{'benthic_foraminfera',\n", - " 'dinoflagellates/acritarchs/prasinophytes',\n", - " 'macrofossils',\n", - " 'miscellaneous',\n", - " 'planktic_foraminfera',\n", - " 'pollen',\n", - " 'pteropods',\n", - " 'silicoflagellates/ebridians/actiniscidians',\n", - " 'sponge_spicules',\n", - " 'trace_fossils'}" - ] - }, - "execution_count": 230, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "set(noaa_groups) - set(approved_groups)" - ] - }, - { - "cell_type": "code", - "execution_count": 231, - "id": "a3aa6f12", - "metadata": {}, - "outputs": [ + "name": "stdout", + "output_type": "stream", + "text": [ + "(2093, 5)\n" + ] + }, { "data": { "text/html": [ @@ -667,127 +397,96 @@ " \n", " \n", " \n", - " verbatim_name\n", + " path\n", + " type\n", " taxon_group\n", - " genus name\n", - " simplified_name\n", - " species name\n", - " subspecies name\n", + " expedition\n", + " site\n", " \n", " \n", " \n", " \n", " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", - " NaN\n", - " \n", - " \n", - " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_forams\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", - " NaN\n", + " NOAA/DSDP_core_data/61/462/radiolar.csv\n", + " taxa\n", + " radiolarians\n", + " 61\n", + " 462\n", " \n", " \n", " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", - " NaN\n", + " NOAA/DSDP_core_data/61/462/b_forams.csv\n", + " taxa\n", + " benthic_forams\n", + " 61\n", + " 462\n", " \n", " \n", " 3\n", - " Globanomalina planocompressa\n", + " NOAA/DSDP_core_data/61/462/p_forams.csv\n", + " taxa\n", " planktic_forams\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", - " NaN\n", + " 61\n", + " 462\n", " \n", " \n", - " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", - " NaN\n", + " 5\n", + " NOAA/DSDP_core_data/61/462/nannos.csv\n", + " taxa\n", + " nannofossils\n", + " 61\n", + " 462\n", + " \n", + " \n", + " 7\n", + " NOAA/DSDP_core_data/61/462A/radiolar.csv\n", + " taxa\n", + " radiolarians\n", + " 61\n", + " 462A\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " + " path type taxon_group expedition \\\n", + "0 NOAA/DSDP_core_data/61/462/radiolar.csv taxa radiolarians 61 \n", + "2 NOAA/DSDP_core_data/61/462/b_forams.csv taxa benthic_forams 61 \n", + "3 NOAA/DSDP_core_data/61/462/p_forams.csv taxa planktic_forams 61 \n", + "5 NOAA/DSDP_core_data/61/462/nannos.csv taxa nannofossils 61 \n", + "7 NOAA/DSDP_core_data/61/462A/radiolar.csv taxa radiolarians 61 \n", + "\n", + " site \n", + "0 462 \n", + "2 462 \n", + "3 462 \n", + "5 462 \n", + "7 462A " ] }, - "execution_count": 231, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "noaa_taxa_df = noaa_taxa_df.replace(['benthic_foraminfera', 'planktic_foraminfera'],\n", - " ['benthic_forams', 'planktic_forams'])\n", - "noaa_taxa_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 232, - "id": "cacd2fdc", - "metadata": {}, - "outputs": [], - "source": [ - "noaa_taxa_df.to_csv(crosswalk_path, index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "7f09599a", - "metadata": {}, - "source": [ - "## combine noaa taxa" + "metadata_1 = pd.read_csv(metadata_1_96_path)\n", + "metadata_1 = metadata_1[metadata_1['type'] == 'taxa']\n", + "log_df(metadata_1)" ] }, { "cell_type": "code", - "execution_count": 233, - "id": "92a3fd52", + "execution_count": 6, + "id": "3bf4b0d2-2598-4afe-a2df-a2650052ea59", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(9932, 6)\n" + "(9933, 10)\n" ] }, { @@ -811,103 +510,151 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", + " verbatim_name\n", + " name\n", + " genus modifier\n", " genus name\n", - " simplified_name\n", + " species modifier\n", " species name\n", + " subspecies modifier\n", " subspecies name\n", + " non-taxa descriptor\n", " \n", " \n", " \n", " \n", - " 0\n", - " Reticulofenestra minutulus (q)\n", - " nannofossils\n", - " Reticulofenestra\n", - " Reticulofenestra minutulus\n", - " minutulus\n", - " NaN\n", + " 2076\n", + " Ebridians and Actinicidians\n", + " Actiniscus elongatus (q)\n", + " ? Actiniscus elongatus\n", + " ?\n", + " Actiniscus\n", + " <NA>\n", + " elongatus\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", - " 1\n", - " Theocotyle alpha\n", - " radiolarians\n", - " Theocotyle\n", - " Theocotyle alpha\n", - " alpha\n", - " NaN\n", + " 471\n", + " Ebridians and Actinicidians\n", + " Actiniscus laciniatus (q)\n", + " ? Actiniscus laciniatus\n", + " ?\n", + " Actiniscus\n", + " <NA>\n", + " laciniatus\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", - " 2\n", - " Coscinodiscus kuetzingii\n", - " diatoms\n", - " Coscinodiscus\n", - " Coscinodiscus kuetzingii\n", - " kuetzingii\n", - " NaN\n", + " 2752\n", + " Ebridians and Actinicidians\n", + " Actiniscus pentasterias\n", + " \n", + " <NA>\n", + " Actiniscus\n", + " <NA>\n", + " pentasterias\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", - " 3\n", - " Fragilaria hirosakiensis\n", - " diatoms\n", - " Fragilaria\n", - " Fragilaria hirosakiensis\n", - " hirosakiensis\n", - " NaN\n", + " 5794\n", + " Ebridians and Actinicidians\n", + " Actiniscus sp.\n", + " \n", + " <NA>\n", + " Actiniscus\n", + " <NA>\n", + " sp.\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", - " 4\n", - " Globorotalia truncatulinoides (sin)\n", - " planktic_forams\n", - " Globorotalia\n", - " Globorotalia truncatulinoides\n", - " truncatulinoides\n", - " NaN\n", + " 1261\n", + " Ebridians and Actinicidians\n", + " Actiniscus squamosus\n", + " \n", + " <NA>\n", + " Actiniscus\n", + " <NA>\n", + " squamosus\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group genus name \\\n", - "0 Reticulofenestra minutulus (q) nannofossils Reticulofenestra \n", - "1 Theocotyle alpha radiolarians Theocotyle \n", - "2 Coscinodiscus kuetzingii diatoms Coscinodiscus \n", - "3 Fragilaria hirosakiensis diatoms Fragilaria \n", - "4 Globorotalia truncatulinoides (sin) planktic_forams Globorotalia \n", - "\n", - " simplified_name species name subspecies name \n", - "0 Reticulofenestra minutulus minutulus NaN \n", - "1 Theocotyle alpha alpha NaN \n", - "2 Coscinodiscus kuetzingii kuetzingii NaN \n", - "3 Fragilaria hirosakiensis hirosakiensis NaN \n", - "4 Globorotalia truncatulinoides truncatulinoides NaN " + " taxon_group verbatim_name \\\n", + "2076 Ebridians and Actinicidians Actiniscus elongatus (q) \n", + "471 Ebridians and Actinicidians Actiniscus laciniatus (q) \n", + "2752 Ebridians and Actinicidians Actiniscus pentasterias \n", + "5794 Ebridians and Actinicidians Actiniscus sp. \n", + "1261 Ebridians and Actinicidians Actiniscus squamosus \n", + "\n", + " name genus modifier genus name species modifier \\\n", + "2076 ? Actiniscus elongatus ? Actiniscus \n", + "471 ? Actiniscus laciniatus ? Actiniscus \n", + "2752 Actiniscus \n", + "5794 Actiniscus \n", + "1261 Actiniscus \n", + "\n", + " species name subspecies modifier subspecies name non-taxa descriptor \n", + "2076 elongatus \n", + "471 laciniatus \n", + "2752 pentasterias \n", + "5794 sp. \n", + "1261 squamosus " ] }, - "execution_count": 233, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# 9932\n", - "\n", - "noaa_1_taxa_df = pd.read_csv(noaa_1_96_taxa_path)\n", - "log_df(noaa_1_taxa_df)" + "taxa_1_df = nt.create_noaa_1_taxa_crosswalk_df(metadata_1, CLEAN_DATA_DIR)\n", + "log_df(taxa_1_df)\n", + "# 9933" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "8935145b-35d7-44ab-8c25-216b3d05928f", + "metadata": {}, + "outputs": [], + "source": [ + "taxa_1_df.to_csv(noaa_1_96_taxa_path, index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "e6a95bd0", + "metadata": {}, + "source": [ + "## Created taxa list with unapproved NOAA taxa" ] }, { "cell_type": "code", - "execution_count": 235, - "id": "df65b8c2", + "execution_count": 10, + "id": "a6c12077", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(13285, 6)\n" + "(13066, 10)\n" ] }, { @@ -931,58 +678,82 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", + " verbatim_name\n", + " name\n", + " genus modifier\n", " genus name\n", - " simplified_name\n", + " species modifier\n", " species name\n", + " subspecies modifier\n", " subspecies name\n", + " non-taxa descriptor\n", " \n", " \n", " \n", " \n", " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Labyrinthodinium sp. 1\n", + " NaN\n", + " ?\n", + " Labyrinthodinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", " NaN\n", " \n", " \n", " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_forams\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Maduradinium sp.\n", + " NaN\n", + " ?\n", + " Maduradinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " NaN\n", " NaN\n", " \n", " \n", " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Pyxidiella sp. 1\n", + " NaN\n", + " ?\n", + " Pyxidiella\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", " NaN\n", " \n", " \n", " 3\n", - " Globanomalina planocompressa\n", - " planktic_forams\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella ivoirensis\n", + " NaN\n", + " NaN\n", + " Aandalusiella\n", + " NaN\n", + " ivoirensis\n", + " NaN\n", + " NaN\n", " NaN\n", " \n", " \n", " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium cardioforme\n", + " NaN\n", + " NaN\n", + " Abratopdinium\n", + " NaN\n", + " cardioforme\n", + " NaN\n", + " NaN\n", " NaN\n", " \n", " \n", @@ -990,51 +761,51 @@ "" ], "text/plain": [ - " verbatim_name taxon_group \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor \n", + "0 NaN 1 NaN \n", + "1 NaN NaN NaN \n", + "2 NaN 1 NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN " ] }, - "execution_count": 235, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# 13285\n", + "noaa_2_taxa_df = pd.read_csv(noaa_101_210_taxa_path)\n", + "log_df(noaa_2_taxa_df)\n", "\n", - "noaa_taxa_df = pd.read_csv(crosswalk_path)\n", - "log_df(noaa_taxa_df)" + "# 13066" ] }, { "cell_type": "code", - "execution_count": 236, - "id": "c24748d4", + "execution_count": 11, + "id": "701b6806", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(23217, 6)\n" + "(9933, 2)\n" ] }, { @@ -1058,103 +829,72 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", - " genus name\n", - " simplified_name\n", - " species name\n", - " subspecies name\n", + " verbatim_name\n", " \n", " \n", " \n", " \n", " 0\n", - " Reticulofenestra minutulus (q)\n", - " nannofossils\n", - " Reticulofenestra\n", - " Reticulofenestra minutulus\n", - " minutulus\n", - " NaN\n", + " Ebridians and Actinicidians\n", + " Actiniscus elongatus (q)\n", " \n", " \n", " 1\n", - " Theocotyle alpha\n", - " radiolarians\n", - " Theocotyle\n", - " Theocotyle alpha\n", - " alpha\n", - " NaN\n", + " Ebridians and Actinicidians\n", + " Actiniscus laciniatus (q)\n", " \n", " \n", " 2\n", - " Coscinodiscus kuetzingii\n", - " diatoms\n", - " Coscinodiscus\n", - " Coscinodiscus kuetzingii\n", - " kuetzingii\n", - " NaN\n", + " Ebridians and Actinicidians\n", + " Actiniscus pentasterias\n", " \n", " \n", " 3\n", - " Fragilaria hirosakiensis\n", - " diatoms\n", - " Fragilaria\n", - " Fragilaria hirosakiensis\n", - " hirosakiensis\n", - " NaN\n", + " Ebridians and Actinicidians\n", + " Actiniscus sp.\n", " \n", " \n", " 4\n", - " Globorotalia truncatulinoides (sin)\n", - " planktic_forams\n", - " Globorotalia\n", - " Globorotalia truncatulinoides\n", - " truncatulinoides\n", - " NaN\n", + " Ebridians and Actinicidians\n", + " Actiniscus squamosus\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group genus name \\\n", - "0 Reticulofenestra minutulus (q) nannofossils Reticulofenestra \n", - "1 Theocotyle alpha radiolarians Theocotyle \n", - "2 Coscinodiscus kuetzingii diatoms Coscinodiscus \n", - "3 Fragilaria hirosakiensis diatoms Fragilaria \n", - "4 Globorotalia truncatulinoides (sin) planktic_forams Globorotalia \n", - "\n", - " simplified_name species name subspecies name \n", - "0 Reticulofenestra minutulus minutulus NaN \n", - "1 Theocotyle alpha alpha NaN \n", - "2 Coscinodiscus kuetzingii kuetzingii NaN \n", - "3 Fragilaria hirosakiensis hirosakiensis NaN \n", - "4 Globorotalia truncatulinoides truncatulinoides NaN " + " taxon_group verbatim_name\n", + "0 Ebridians and Actinicidians Actiniscus elongatus (q)\n", + "1 Ebridians and Actinicidians Actiniscus laciniatus (q)\n", + "2 Ebridians and Actinicidians Actiniscus pentasterias\n", + "3 Ebridians and Actinicidians Actiniscus sp.\n", + "4 Ebridians and Actinicidians Actiniscus squamosus" ] }, - "execution_count": 236, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# 23217\n", + "noaa_1_taxa_df = pd.read_csv(noaa_1_96_taxa_path, usecols=['taxon_group', 'verbatim_name'])\n", + "log_df(noaa_1_taxa_df)\n", "\n", - "combined_df = pd.concat([noaa_1_taxa_df, noaa_taxa_df])\n", - "log_df(combined_df)" + "# 9933" ] }, { "cell_type": "code", - "execution_count": 237, - "id": "6662822f", + "execution_count": 12, + "id": "a598fd40-5444-43f0-836b-f1d3fc38bb19", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(20281, 6)\n" + "(5380, 2)\n" ] }, { @@ -1178,111 +918,80 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", - " genus name\n", - " simplified_name\n", - " species name\n", - " subspecies name\n", + " verbatim_name\n", " \n", " \n", " \n", " \n", " 0\n", - " Reticulofenestra minutulus (q)\n", - " nannofossils\n", - " Reticulofenestra\n", - " Reticulofenestra minutulus\n", - " minutulus\n", - " NaN\n", + " benthic_forams\n", + " Euuvigerina miozea (group) >100 m\n", " \n", " \n", " 1\n", - " Theocotyle alpha\n", - " radiolarians\n", - " Theocotyle\n", - " Theocotyle alpha\n", - " alpha\n", - " NaN\n", + " benthic_forams\n", + " Euuvigerina rodleyi (group) >50 m\n", " \n", " \n", " 2\n", - " Coscinodiscus kuetzingii\n", - " diatoms\n", - " Coscinodiscus\n", - " Coscinodiscus kuetzingii\n", - " kuetzingii\n", - " NaN\n", + " benthic_forams\n", + " Others\n", " \n", " \n", " 3\n", - " Fragilaria hirosakiensis\n", - " diatoms\n", - " Fragilaria\n", - " Fragilaria hirosakiensis\n", - " hirosakiensis\n", - " NaN\n", + " benthic_forams\n", + " Pleurostomellids comment\n", " \n", " \n", " 4\n", - " Globorotalia truncatulinoides (sin)\n", - " planktic_forams\n", - " Globorotalia\n", - " Globorotalia truncatulinoides\n", - " truncatulinoides\n", - " NaN\n", + " benthic_forams\n", + " Ostracoda spp.\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group genus name \\\n", - "0 Reticulofenestra minutulus (q) nannofossils Reticulofenestra \n", - "1 Theocotyle alpha radiolarians Theocotyle \n", - "2 Coscinodiscus kuetzingii diatoms Coscinodiscus \n", - "3 Fragilaria hirosakiensis diatoms Fragilaria \n", - "4 Globorotalia truncatulinoides (sin) planktic_forams Globorotalia \n", - "\n", - " simplified_name species name subspecies name \n", - "0 Reticulofenestra minutulus minutulus NaN \n", - "1 Theocotyle alpha alpha NaN \n", - "2 Coscinodiscus kuetzingii kuetzingii NaN \n", - "3 Fragilaria hirosakiensis hirosakiensis NaN \n", - "4 Globorotalia truncatulinoides truncatulinoides NaN " + " taxon_group verbatim_name\n", + "0 benthic_forams Euuvigerina miozea (group) >100 m\n", + "1 benthic_forams Euuvigerina rodleyi (group) >50 m\n", + "2 benthic_forams Others\n", + "3 benthic_forams Pleurostomellids comment\n", + "4 benthic_forams Ostracoda spp." ] }, - "execution_count": 237, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# 20281\n", + "lims_taxa_df = pd.read_csv(lims_crosswalk_path, usecols=['taxon_group', 'verbatim_name'])\n", "\n", - "combined_df = combined_df.drop_duplicates()\n", - "log_df(combined_df)" + "log_df(lims_taxa_df)\n", + "# 5380" ] }, { "cell_type": "markdown", - "id": "e6a95bd0", + "id": "946ee42c-7438-41f5-af56-2c83632616f0", "metadata": {}, "source": [ - "## Compare NOAA 1-96 taxa with NOAA 101-210" + "get taxa that PIs have looked at" ] }, { "cell_type": "code", - "execution_count": 238, - "id": "701b6806", + "execution_count": 13, + "id": "994d5ca0-380a-4715-81b0-2e406c51a242", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(9932, 2)\n" + "(13588, 2)\n" ] }, { @@ -1307,69 +1016,80 @@ " \n", " \n", " taxon_group\n", - " simplified_name\n", + " verbatim_name\n", " \n", " \n", " \n", " \n", " 0\n", - " nannofossils\n", - " Reticulofenestra minutulus\n", + " Ebridians and Actinicidians\n", + " Actiniscus elongatus (q)\n", " \n", " \n", " 1\n", - " radiolarians\n", - " Theocotyle alpha\n", + " Ebridians and Actinicidians\n", + " Actiniscus laciniatus (q)\n", " \n", " \n", " 2\n", - " diatoms\n", - " Coscinodiscus kuetzingii\n", + " Ebridians and Actinicidians\n", + " Actiniscus pentasterias\n", " \n", " \n", " 3\n", - " diatoms\n", - " Fragilaria hirosakiensis\n", + " Ebridians and Actinicidians\n", + " Actiniscus sp.\n", " \n", " \n", " 4\n", - " planktic_forams\n", - " Globorotalia truncatulinoides\n", + " Ebridians and Actinicidians\n", + " Actiniscus squamosus\n", " \n", " \n", "\n", "" ], "text/plain": [ - " taxon_group simplified_name\n", - "0 nannofossils Reticulofenestra minutulus\n", - "1 radiolarians Theocotyle alpha\n", - "2 diatoms Coscinodiscus kuetzingii\n", - "3 diatoms Fragilaria hirosakiensis\n", - "4 planktic_forams Globorotalia truncatulinoides" + " taxon_group verbatim_name\n", + "0 Ebridians and Actinicidians Actiniscus elongatus (q)\n", + "1 Ebridians and Actinicidians Actiniscus laciniatus (q)\n", + "2 Ebridians and Actinicidians Actiniscus pentasterias\n", + "3 Ebridians and Actinicidians Actiniscus sp.\n", + "4 Ebridians and Actinicidians Actiniscus squamosus" ] }, - "execution_count": 238, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "noaa_1_taxa_df = pd.read_csv(noaa_1_96_taxa_path, usecols=['simplified_name', 'taxon_group'])\n", - "log_df(noaa_1_taxa_df)" + "approved_taxa_df = pd.concat([noaa_1_taxa_df, lims_taxa_df])\n", + "approved_taxa_df.drop_duplicates(inplace=True)\n", + "\n", + "log_df(approved_taxa_df)\n", + "# 13588" + ] + }, + { + "cell_type": "markdown", + "id": "edba37a5", + "metadata": {}, + "source": [ + "select NOAA taxa that needs approval" ] }, { "cell_type": "code", - "execution_count": 239, - "id": "a6c12077", + "execution_count": 14, + "id": "34e7dcec", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(13285, 6)\n" + "(13066, 11)\n" ] }, { @@ -1393,116 +1113,153 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", + " verbatim_name\n", + " name\n", + " genus modifier\n", " genus name\n", - " simplified_name\n", + " species modifier\n", " species name\n", + " subspecies modifier\n", " subspecies name\n", + " non-taxa descriptor\n", + " _merge_type\n", " \n", " \n", " \n", " \n", " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Labyrinthodinium sp. 1\n", + " NaN\n", + " ?\n", + " Labyrinthodinium\n", + " NaN\n", + " sp.\n", " NaN\n", + " 1\n", + " NaN\n", + " left_only\n", " \n", " \n", " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_forams\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Maduradinium sp.\n", + " NaN\n", + " ?\n", + " Maduradinium\n", " NaN\n", + " sp.\n", + " NaN\n", + " NaN\n", + " NaN\n", + " left_only\n", " \n", " \n", " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Pyxidiella sp. 1\n", + " NaN\n", + " ?\n", + " Pyxidiella\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", " NaN\n", + " left_only\n", " \n", " \n", " 3\n", - " Globanomalina planocompressa\n", - " planktic_forams\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella ivoirensis\n", + " NaN\n", + " NaN\n", + " Aandalusiella\n", + " NaN\n", + " ivoirensis\n", + " NaN\n", " NaN\n", + " NaN\n", + " left_only\n", " \n", " \n", " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium cardioforme\n", + " NaN\n", + " NaN\n", + " Abratopdinium\n", " NaN\n", + " cardioforme\n", + " NaN\n", + " NaN\n", + " NaN\n", + " left_only\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor _merge_type \n", + "0 NaN 1 NaN left_only \n", + "1 NaN NaN NaN left_only \n", + "2 NaN 1 NaN left_only \n", + "3 NaN NaN NaN left_only \n", + "4 NaN NaN NaN left_only " ] }, - "execution_count": 239, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "noaa_taxa_df = pd.read_csv(crosswalk_path)\n", - "log_df(noaa_taxa_df)" + "merged_df = noaa_2_taxa_df.merge(approved_taxa_df, \n", + " on=['verbatim_name', 'taxon_group' ], \n", + " how='left',\n", + " indicator='_merge_type')\n", + "\n", + "merged_df = merged_df.drop_duplicates()\n", + "\n", + "log_df(merged_df)\n", + "# 13066 " ] }, { "cell_type": "markdown", - "id": "edba37a5", + "id": "977133c2", "metadata": {}, "source": [ - "### merge NOAA taxa" + "Select unapproved NOAA taxa. When _merged is both, it means the taxa has been approved. When _merged is left_only, it means the taxa has not been approved." ] }, { "cell_type": "code", - "execution_count": 241, - "id": "34e7dcec", + "execution_count": 15, + "id": "0840253d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(14140, 7)\n" + "(9024, 10)\n" ] }, { @@ -1526,119 +1283,188 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", + " verbatim_name\n", + " name\n", + " genus modifier\n", " genus name\n", - " simplified_name\n", + " species modifier\n", " species name\n", + " subspecies modifier\n", " subspecies name\n", - " _merge_approved\n", + " non-taxa descriptor\n", " \n", " \n", " \n", " \n", " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Labyrinthodinium sp. 1\n", + " NaN\n", + " ?\n", + " Labyrinthodinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", " NaN\n", - " left_only\n", " \n", " \n", " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_forams\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Maduradinium sp.\n", + " NaN\n", + " ?\n", + " Maduradinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Pyxidiella sp. 1\n", + " NaN\n", + " ?\n", + " Pyxidiella\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", " NaN\n", - " left_only\n", " \n", " \n", " 3\n", - " Globanomalina planocompressa\n", - " planktic_forams\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella ivoirensis\n", + " NaN\n", + " NaN\n", + " Aandalusiella\n", + " NaN\n", + " ivoirensis\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium cardioforme\n", + " NaN\n", + " NaN\n", + " Abratopdinium\n", + " NaN\n", + " cardioforme\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name _merge_approved \n", - "0 NaN left_only \n", - "1 NaN left_only \n", - "2 NaN left_only \n", - "3 NaN left_only \n", - "4 NaN left_only " + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor \n", + "0 NaN 1 NaN \n", + "1 NaN NaN NaN \n", + "2 NaN 1 NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN " ] }, - "execution_count": 241, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# 14140 \n", - "merged_df = pd.merge(noaa_taxa_df, noaa_1_taxa_df, \n", - " on=['simplified_name', 'taxon_group' ], \n", - " how='left',\n", - " indicator='_merge_approved')\n", + "unapproved_taxa_df = merged_df[merged_df['_merge_type'] == 'left_only'].copy()\n", + "\n", + "del unapproved_taxa_df['_merge_type']\n", + "\n", + "log_df(unapproved_taxa_df)\n", + "\n", + "# (9024, 9)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "79e68854", + "metadata": {}, + "outputs": [], + "source": [ + "unapproved_taxa_df.to_csv(unapproved_taxa_path, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c4987e2e-7e40-4aab-a619-7772112838c7", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "54bc2d22", + "metadata": {}, + "source": [ + "## create species csv" + ] + }, + { + "cell_type": "markdown", + "id": "7a5a71e7", + "metadata": {}, + "source": [ + "Look up the genus for unapproved taxa in PBDB" + ] + }, + { + "cell_type": "code", + "execution_count": 73, + "id": "d50cf1d7-708a-47fe-abc2-e3f3c9f75fa7", + "metadata": {}, + "outputs": [], + "source": [ + "def add_genus_species(taxa_df):\n", + " taxa_df.loc[~taxa_df['species name'].str.contains('spp\\.|sp\\..*?', regex=True).fillna(False), 'genus species']=taxa_df['genus name'] + ' ' + taxa_df['species name']\n", + " # taxa_df.loc[taxa_df['species name'].isna(), 'genus species']=taxa_df['genus name'] \n", + " # taxa_df.loc[taxa_df['species name'].str.contains('spp\\.|sp\\..*?', regex=True).fillna(False), 'genus species']=taxa_df['genus name']\n", + " taxa_df['genus species'] = taxa_df['genus species'].str.replace('(?)', '', regex=False)\n", + " taxa_df['genus species'] = taxa_df['genus species'].str.replace('?', '', regex=False)\n", "\n", - "log_df(merged_df)\n" + " taxa_df['genus species'] = taxa_df['genus species'].str.strip()" ] }, { "cell_type": "code", - "execution_count": 243, - "id": "0bfc8325", + "execution_count": 74, + "id": "6d2ffa28-7dae-4dce-97de-f153cb8733b1", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(13285, 7)\n" + "(9024, 25)\n" ] }, { @@ -1662,267 +1488,223 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", + " verbatim_name\n", + " name\n", + " genus modifier\n", " genus name\n", - " simplified_name\n", + " species modifier\n", " species name\n", + " subspecies modifier\n", " subspecies name\n", - " _merge_approved\n", + " non-taxa descriptor\n", + " ...\n", + " order_taxon_id\n", + " order_taxon_name\n", + " class_taxon_id\n", + " class_taxon_name\n", + " phylum_taxon_id\n", + " phylum_taxon_name\n", + " kingdom_taxon_id\n", + " kingdom_taxon_name\n", + " Any taxon above genus\n", + " genus species\n", " \n", " \n", " \n", " \n", " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Labyrinthodinium sp. 1\n", + " NaN\n", + " ?\n", + " Labyrinthodinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", + " NaN\n", + " ...\n", + " 321606.0\n", + " Gonyaulacales\n", + " 321578.0\n", + " Dinophyceae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_forams\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Maduradinium sp.\n", + " NaN\n", + " ?\n", + " Maduradinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " 277919.0\n", + " Peridiniales\n", + " 321578.0\n", + " Dinophyceae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Pyxidiella sp. 1\n", " NaN\n", - " left_only\n", - " \n", - " \n", - " 3\n", - " Globanomalina planocompressa\n", - " planktic_forams\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", + " ?\n", + " Pyxidiella\n", " NaN\n", - " left_only\n", - " \n", - " \n", - " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", + " sp.\n", " NaN\n", - " left_only\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " verbatim_name taxon_group \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name _merge_approved \n", - "0 NaN left_only \n", - "1 NaN left_only \n", - "2 NaN left_only \n", - "3 NaN left_only \n", - "4 NaN left_only " - ] - }, - "execution_count": 243, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# 13285\n", - "merged_df = merged_df.drop_duplicates()\n", - "log_df(merged_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 244, - "id": "e527300a", - "metadata": {}, - "outputs": [], - "source": [ - "merged_df.to_csv(merged_path, index=False)\n" - ] - }, - { - "cell_type": "code", - "execution_count": 246, - "id": "8503d4f8", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(14511, 8)\n" - ] - }, - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
verbatim_nametaxon_group_xgenus namesimplified_namespecies namesubspecies nametaxon_group_y_merge_approved
0Talimudinium scissurumdinoflagellates/acritarchs/prasinophytesTalimudiniumTalimudinium scissurumscissurum1NaN...277919.0Peridiniales321578.0DinophyceaeNaNNaNleft_only
1Pseudoclavulina rugolosabenthic_foramsPseudoclavulinaPseudoclavulina rugolosarugolosaNaNNaNleft_only
2Pentadinium goniferumnannofossilsPentadiniumPentadinium goniferumgoniferumNaNNaNleft_only
3Globanomalina planocompressaplanktic_foramsGlobanomalinaGlobanomalina planocompressaplanocompressaDinoflagellates/Acritarchs/PrasinophytesAandalusiella ivoirensisNaNNaNleft_onlyAandalusiellaNaNivoirensisNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNAandalusiella ivoirensis
4Obliquipithonella multistratadinoflagellates/acritarchs/prasinophytesObliquipithonellaObliquipithonella multistratamultistrataDinoflagellates/Acritarchs/PrasinophytesAbratopdinium cardioformeNaNNaNleft_onlyAbratopdiniumNaNcardioformeNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNAbratopdinium cardioforme
\n", + "

5 rows × 25 columns

\n", "
" ], "text/plain": [ - " verbatim_name taxon_group_x \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name taxon_group_y _merge_approved \n", - "0 NaN NaN left_only \n", - "1 NaN NaN left_only \n", - "2 NaN NaN left_only \n", - "3 NaN NaN left_only \n", - "4 NaN NaN left_only " + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor ... \\\n", + "0 NaN 1 NaN ... \n", + "1 NaN NaN NaN ... \n", + "2 NaN 1 NaN ... \n", + "3 NaN NaN NaN ... \n", + "4 NaN NaN NaN ... \n", + "\n", + " order_taxon_id order_taxon_name class_taxon_id class_taxon_name \\\n", + "0 321606.0 Gonyaulacales 321578.0 Dinophyceae \n", + "1 277919.0 Peridiniales 321578.0 Dinophyceae \n", + "2 277919.0 Peridiniales 321578.0 Dinophyceae \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " phylum_taxon_id phylum_taxon_name kingdom_taxon_id kingdom_taxon_name \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " Any taxon above genus genus species \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN Aandalusiella ivoirensis \n", + "4 NaN Abratopdinium cardioforme \n", + "\n", + "[5 rows x 25 columns]" ] }, - "execution_count": 246, + "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# 14511 \n", - "merged2_df = pd.merge(noaa_taxa_df, noaa_1_taxa_df, \n", - " on=['simplified_name'], \n", - " how='left',\n", - " indicator='_merge_approved')\n", - "\n", - "log_df(merged2_df)\n" + "df = pd.read_csv(unapproved_taxa_path)\n", + "add_genus_species(df)\n", + "log_df(df)" ] }, { "cell_type": "code", - "execution_count": 248, - "id": "ce38d4ea", + "execution_count": 75, + "id": "f687c443-0152-4a99-b52e-9a827343d78a", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(13296, 8)\n" + "(5525, 3)\n" ] }, { @@ -1946,163 +1728,166 @@ " \n", " \n", " \n", - " verbatim_name\n", - " taxon_group_x\n", - " genus name\n", - " simplified_name\n", - " species name\n", - " subspecies name\n", - " taxon_group_y\n", - " _merge_approved\n", + " taxon_group\n", + " genus species\n", + " check\n", " \n", " \n", " \n", " \n", - " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", - " NaN\n", - " NaN\n", - " left_only\n", + " 3\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella ivoirensis\n", + " False\n", " \n", " \n", - " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_forams\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", - " NaN\n", - " NaN\n", - " left_only\n", + " 4\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium cardioforme\n", + " False\n", " \n", " \n", - " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", - " NaN\n", - " NaN\n", - " left_only\n", + " 5\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium kerguelense\n", + " False\n", " \n", " \n", - " 3\n", - " Globanomalina planocompressa\n", - " planktic_forams\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", - " NaN\n", - " NaN\n", - " left_only\n", + " 6\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Acanthaulax granulata\n", + " False\n", " \n", " \n", - " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", - " NaN\n", - " NaN\n", - " left_only\n", + " 7\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Acanthaulax wilsonii\n", + " False\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group_x \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name taxon_group_y _merge_approved \n", - "0 NaN NaN left_only \n", - "1 NaN NaN left_only \n", - "2 NaN NaN left_only \n", - "3 NaN NaN left_only \n", - "4 NaN NaN left_only " + " taxon_group genus species check\n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis False\n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme False\n", + "5 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium kerguelense False\n", + "6 Dinoflagellates/Acritarchs/Prasinophytes Acanthaulax granulata False\n", + "7 Dinoflagellates/Acritarchs/Prasinophytes Acanthaulax wilsonii False" ] }, - "execution_count": 248, + "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# 13296\n", - "merged2_df = merged2_df.drop_duplicates()\n", - "log_df(merged2_df)\n" + "species_df = df[df['genus species'].notna()].copy()[['taxon_group', 'genus species']]\n", + "species_df.drop_duplicates(inplace=True)\n", + "species_df['check'] = False\n", + "\n", + "log_df(species_df)\n", + "# 5525" ] }, { - "cell_type": "markdown", - "id": "ae299665", + "cell_type": "code", + "execution_count": 90, + "id": "5624d496-6af7-465f-9003-ad6c12248164", + "metadata": {}, + "outputs": [], + "source": [ + "pbdb.fetch_pdbd_data(species_df, 'genus species')" + ] + }, + { + "cell_type": "raw", + "id": "f17c5eb3-0975-4eb3-ac77-acefcb60c8de", "metadata": {}, "source": [ - "save changes to taxa_all csv" + "log_df(species_df)" ] }, { "cell_type": "code", - "execution_count": 249, - "id": "f1d20345", + "execution_count": 91, + "id": "68baafa8-8e24-42b5-8a7f-0713515304a0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(0, 20)" + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "merged2_df.to_csv(merged2_path, index=False)" + "species_df[species_df['check'] == False].shape" ] }, { - "cell_type": "markdown", - "id": "bd60feb5", + "cell_type": "code", + "execution_count": 94, + "id": "fc5a619b-a59c-462a-8cfe-aba8702b982e", "metadata": {}, + "outputs": [], "source": [ - "the reason for the count difference is because a taxa in multiple taxon groups\n" + "# del species_df['check']\n" ] }, { - "cell_type": "markdown", - "id": "25e3a633", + "cell_type": "code", + "execution_count": 95, + "id": "f1bfa0cc-ae7c-47e2-a742-b931e9a0831c", "metadata": {}, + "outputs": [], "source": [ - "## Created taxa list with unapproved NOAA taxa" + "species_df.to_csv(species_path, index=False)" ] }, { "cell_type": "markdown", - "id": "977133c2", + "id": "5bd0579d-8ccc-48bc-ada6-aeebf436e39d", + "metadata": { + "tags": [] + }, + "source": [ + "## add species pbdb info to unapproved taxa " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "0613de74-bcbc-4306-9696-60cfde49b013", "metadata": {}, + "outputs": [], "source": [ - "Select unapproved NOAA taxa. When _merged is both, it means the taxa has been approved. When _merged is left_only, it means the taxa has not been approved." + "def add_genus_species(taxa_df):\n", + " taxa_df.loc[~taxa_df['species name'].str.contains('spp\\.|sp\\..*?', regex=True).fillna(False), 'genus species']=taxa_df['genus name'] + ' ' + taxa_df['species name']\n", + " # taxa_df.loc[taxa_df['species name'].isna(), 'genus species']=taxa_df['genus name'] \n", + " # taxa_df.loc[taxa_df['species name'].str.contains('spp\\.|sp\\..*?', regex=True).fillna(False), 'genus species']=taxa_df['genus name']\n", + " taxa_df['genus species'] = taxa_df['genus species'].str.replace('(?)', '', regex=False)\n", + " taxa_df['genus species'] = taxa_df['genus species'].str.replace('?', '', regex=False)\n", + "\n", + " taxa_df['genus species'] = taxa_df['genus species'].str.strip()" ] }, { "cell_type": "code", - "execution_count": 250, - "id": "a12b7581", + "execution_count": 4, + "id": "f2796f59-f19d-424f-924d-e8f014e09bdf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(13285, 7)\n" + "(5525, 19)\n" ] }, { @@ -2126,114 +1911,209 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", - " genus name\n", - " simplified_name\n", - " species name\n", - " subspecies name\n", - " _merge_approved\n", + " genus species\n", + " pbdb_taxon_id\n", + " pbdb_taxon_name\n", + " pbdb_taxon_rank\n", + " genus_taxon_id\n", + " genus_taxon_name\n", + " family_taxon_id\n", + " family_taxon_name\n", + " order_taxon_id\n", + " order_taxon_name\n", + " class_taxon_id\n", + " class_taxon_name\n", + " phylum_taxon_id\n", + " phylum_taxon_name\n", + " kingdom_taxon_id\n", + " kingdom_taxon_name\n", + " unranked clade_taxon_id\n", + " unranked clade_taxon_name\n", " \n", " \n", " \n", " \n", " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella ivoirensis\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_forams\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium cardioforme\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium kerguelense\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 3\n", - " Globanomalina planocompressa\n", - " planktic_forams\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Acanthaulax granulata\n", " NaN\n", - " left_only\n", - " \n", - " \n", - " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", " NaN\n", - " left_only\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " verbatim_name taxon_group \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name _merge_approved \n", - "0 NaN left_only \n", - "1 NaN left_only \n", - "2 NaN left_only \n", - "3 NaN left_only \n", - "4 NaN left_only " + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 4\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Acanthaulax wilsonii\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " taxon_group genus species \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium kerguelense \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Acanthaulax granulata \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Acanthaulax wilsonii \n", + "\n", + " pbdb_taxon_id pbdb_taxon_name pbdb_taxon_rank genus_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " genus_taxon_name family_taxon_id family_taxon_name order_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " order_taxon_name class_taxon_id class_taxon_name phylum_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " phylum_taxon_name kingdom_taxon_id kingdom_taxon_name \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " unranked clade_taxon_id unranked clade_taxon_name \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN " ] }, - "execution_count": 250, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "merged_df = pd.read_csv(merged_path)\n", - "log_df(merged_df)" + "species_df = pd.read_csv(species_path, dtype=str)\n", + "\n", + "log_df(species_df)\n", + "# 5525" ] }, { "cell_type": "code", - "execution_count": 252, - "id": "0840253d", + "execution_count": 5, + "id": "ba097f67-692c-47a5-a95b-fe9ee1340c02", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(10099, 7)\n" + "(960, 19)\n" ] }, { @@ -2257,150 +2137,208 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", - " genus name\n", - " simplified_name\n", - " species name\n", - " subspecies name\n", - " _merge_approved\n", + " genus species\n", + " pbdb_taxon_id\n", + " pbdb_taxon_name\n", + " pbdb_taxon_rank\n", + " genus_taxon_id\n", + " genus_taxon_name\n", + " family_taxon_id\n", + " family_taxon_name\n", + " order_taxon_id\n", + " order_taxon_name\n", + " class_taxon_id\n", + " class_taxon_name\n", + " phylum_taxon_id\n", + " phylum_taxon_name\n", + " kingdom_taxon_id\n", + " kingdom_taxon_name\n", + " unranked clade_taxon_id\n", + " unranked clade_taxon_name\n", " \n", " \n", " \n", " \n", - " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", + " 5\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Achilleodinium bianii\n", + " 323992\n", + " Achilleodinium bianii\n", + " species\n", + " 323991\n", + " Achilleodinium\n", + " 321603\n", + " Gonyaulacaceae\n", + " 321606\n", + " Gonyaulacales\n", + " 321578\n", + " Dinophyceae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", - " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_forams\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", + " 15\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Achomosphaera ramulifera\n", + " 277049\n", + " Achomosphaera ramulifera\n", + " species\n", + " 277048\n", + " Achomosphaera\n", + " 321603\n", + " Gonyaulacaceae\n", + " 321606\n", + " Gonyaulacales\n", + " 321578\n", + " Dinophyceae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", - " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", + " 16\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Achomosphaera sagena\n", + " 323552\n", + " Achomosphaera sagena\n", + " species\n", + " 277048\n", + " Achomosphaera\n", + " 321603\n", + " Gonyaulacaceae\n", + " 321606\n", + " Gonyaulacales\n", + " 321578\n", + " Dinophyceae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", - " 3\n", - " Globanomalina planocompressa\n", - " planktic_forams\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", + " 17\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Achomosphaera triangulata\n", + " 323940\n", + " Achomosphaera triangulata\n", + " species\n", + " 277048\n", + " Achomosphaera\n", + " 321603\n", + " Gonyaulacaceae\n", + " 321606\n", + " Gonyaulacales\n", + " 321578\n", + " Dinophyceae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", - " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", + " 18\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Achomosphaera verdieri\n", + " 323553\n", + " Achomosphaera verdieri\n", + " species\n", + " 277048\n", + " Achomosphaera\n", + " 321603\n", + " Gonyaulacaceae\n", + " 321606\n", + " Gonyaulacales\n", + " 321578\n", + " Dinophyceae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name _merge_approved \n", - "0 NaN left_only \n", - "1 NaN left_only \n", - "2 NaN left_only \n", - "3 NaN left_only \n", - "4 NaN left_only " + " taxon_group genus species \\\n", + "5 Dinoflagellates/Acritarchs/Prasinophytes Achilleodinium bianii \n", + "15 Dinoflagellates/Acritarchs/Prasinophytes Achomosphaera ramulifera \n", + "16 Dinoflagellates/Acritarchs/Prasinophytes Achomosphaera sagena \n", + "17 Dinoflagellates/Acritarchs/Prasinophytes Achomosphaera triangulata \n", + "18 Dinoflagellates/Acritarchs/Prasinophytes Achomosphaera verdieri \n", + "\n", + " pbdb_taxon_id pbdb_taxon_name pbdb_taxon_rank genus_taxon_id \\\n", + "5 323992 Achilleodinium bianii species 323991 \n", + "15 277049 Achomosphaera ramulifera species 277048 \n", + "16 323552 Achomosphaera sagena species 277048 \n", + "17 323940 Achomosphaera triangulata species 277048 \n", + "18 323553 Achomosphaera verdieri species 277048 \n", + "\n", + " genus_taxon_name family_taxon_id family_taxon_name order_taxon_id \\\n", + "5 Achilleodinium 321603 Gonyaulacaceae 321606 \n", + "15 Achomosphaera 321603 Gonyaulacaceae 321606 \n", + "16 Achomosphaera 321603 Gonyaulacaceae 321606 \n", + "17 Achomosphaera 321603 Gonyaulacaceae 321606 \n", + "18 Achomosphaera 321603 Gonyaulacaceae 321606 \n", + "\n", + " order_taxon_name class_taxon_id class_taxon_name phylum_taxon_id \\\n", + "5 Gonyaulacales 321578 Dinophyceae NaN \n", + "15 Gonyaulacales 321578 Dinophyceae NaN \n", + "16 Gonyaulacales 321578 Dinophyceae NaN \n", + "17 Gonyaulacales 321578 Dinophyceae NaN \n", + "18 Gonyaulacales 321578 Dinophyceae NaN \n", + "\n", + " phylum_taxon_name kingdom_taxon_id kingdom_taxon_name \\\n", + "5 NaN NaN NaN \n", + "15 NaN NaN NaN \n", + "16 NaN NaN NaN \n", + "17 NaN NaN NaN \n", + "18 NaN NaN NaN \n", + "\n", + " unranked clade_taxon_id unranked clade_taxon_name \n", + "5 NaN NaN \n", + "15 NaN NaN \n", + "16 NaN NaN \n", + "17 NaN NaN \n", + "18 NaN NaN " ] }, - "execution_count": 252, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# (10099, 7)\n", - "unapproved_taxa_df = merged_df[merged_df['_merge_approved'] == 'left_only'].copy()\n", - "\n", - "log_df(unapproved_taxa_df)" - ] - }, - { - "cell_type": "markdown", - "id": "3a4b60c8", - "metadata": {}, - "source": [ - "create csv of unapproved NOAA taxa" - ] - }, - { - "cell_type": "code", - "execution_count": 253, - "id": "79e68854", - "metadata": {}, - "outputs": [], - "source": [ - "unapproved_taxa_df.to_csv(taxa_path, index=False)" - ] - }, - { - "cell_type": "markdown", - "id": "54bc2d22", - "metadata": {}, - "source": [ - "## Add PBDB data for taxa that aren't approved" - ] - }, - { - "cell_type": "markdown", - "id": "7a5a71e7", - "metadata": {}, - "source": [ - "Look up the genus for unapproved taxa in PBDB" + "species_df = species_df[species_df['pbdb_taxon_name'].notna() & (species_df['pbdb_taxon_rank'] == 'species')]\n", + "log_df(species_df)\n", + "# 960" ] }, { "cell_type": "code", - "execution_count": 254, - "id": "05044bb5", + "execution_count": 6, + "id": "4ed92744-a68b-4a03-916a-582d569c18fe", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(10099, 7)\n" + "(9024, 11)\n" ] }, { @@ -2424,204 +2362,179 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", + " verbatim_name\n", + " name\n", + " genus modifier\n", " genus name\n", - " simplified_name\n", + " species modifier\n", " species name\n", + " subspecies modifier\n", " subspecies name\n", - " _merge_approved\n", + " non-taxa descriptor\n", + " genus species\n", " \n", " \n", " \n", " \n", " 0\n", - " Talimudinium scissurum\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Talimudinium\n", - " Talimudinium scissurum\n", - " scissurum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Labyrinthodinium sp. 1\n", + " NaN\n", + " ?\n", + " Labyrinthodinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 1\n", - " Pseudoclavulina rugolosa\n", - " benthic_forams\n", - " Pseudoclavulina\n", - " Pseudoclavulina rugolosa\n", - " rugolosa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Maduradinium sp.\n", + " NaN\n", + " ?\n", + " Maduradinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 2\n", - " Pentadinium goniferum\n", - " nannofossils\n", - " Pentadinium\n", - " Pentadinium goniferum\n", - " goniferum\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Pyxidiella sp. 1\n", + " NaN\n", + " ?\n", + " Pyxidiella\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 3\n", - " Globanomalina planocompressa\n", - " planktic_forams\n", - " Globanomalina\n", - " Globanomalina planocompressa\n", - " planocompressa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella ivoirensis\n", " NaN\n", - " left_only\n", + " NaN\n", + " Aandalusiella\n", + " NaN\n", + " ivoirensis\n", + " NaN\n", + " NaN\n", + " NaN\n", + " Aandalusiella ivoirensis\n", " \n", " \n", " 4\n", - " Obliquipithonella multistrata\n", - " dinoflagellates/acritarchs/prasinophytes\n", - " Obliquipithonella\n", - " Obliquipithonella multistrata\n", - " multistrata\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium cardioforme\n", " NaN\n", - " left_only\n", + " NaN\n", + " Abratopdinium\n", + " NaN\n", + " cardioforme\n", + " NaN\n", + " NaN\n", + " NaN\n", + " Abratopdinium cardioforme\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group \\\n", - "0 Talimudinium scissurum dinoflagellates/acritarchs/prasinophytes \n", - "1 Pseudoclavulina rugolosa benthic_forams \n", - "2 Pentadinium goniferum nannofossils \n", - "3 Globanomalina planocompressa planktic_forams \n", - "4 Obliquipithonella multistrata dinoflagellates/acritarchs/prasinophytes \n", - "\n", - " genus name simplified_name species name \\\n", - "0 Talimudinium Talimudinium scissurum scissurum \n", - "1 Pseudoclavulina Pseudoclavulina rugolosa rugolosa \n", - "2 Pentadinium Pentadinium goniferum goniferum \n", - "3 Globanomalina Globanomalina planocompressa planocompressa \n", - "4 Obliquipithonella Obliquipithonella multistrata multistrata \n", - "\n", - " subspecies name _merge_approved \n", - "0 NaN left_only \n", - "1 NaN left_only \n", - "2 NaN left_only \n", - "3 NaN left_only \n", - "4 NaN left_only " + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor \\\n", + "0 NaN 1 NaN \n", + "1 NaN NaN NaN \n", + "2 NaN 1 NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " genus species \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 Aandalusiella ivoirensis \n", + "4 Abratopdinium cardioforme " ] }, - "execution_count": 254, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "unapproved_taxa_df = pd.read_csv(taxa_path)\n", - "log_df(unapproved_taxa_df)" + "unapproved_df = pd.read_csv(unapproved_taxa_path, dtype=str)\n", + "add_genus_species(unapproved_df)\n", + "log_df(unapproved_df)\n", + "# (9024, 11)\n" ] }, { - "cell_type": "markdown", - "id": "64869f8a", + "cell_type": "code", + "execution_count": 7, + "id": "5eda0977-1497-4ba8-8f07-7c5ddfa0a430", "metadata": {}, + "outputs": [], "source": [ - "create a dataframe of unique genera" + "pbdb.add_pbdb_data(unapproved_df, species_df, 'genus species')" ] }, { "cell_type": "code", - "execution_count": 260, - "id": "0698b2ac", + "execution_count": 8, + "id": "4deba606-2163-411d-b26b-c935a4bcbc82", "metadata": {}, "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "(2554, 1)\n" - ] - }, { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
genus name
0Talimudinium
1Pseudoclavulina
2Pentadinium
3Globanomalina
4Obliquipithonella
\n", - "
" - ], "text/plain": [ - " genus name\n", - "0 Talimudinium\n", - "1 Pseudoclavulina\n", - "2 Pentadinium\n", - "3 Globanomalina\n", - "4 Obliquipithonella" + "set()" ] }, - "execution_count": 260, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "unapproved_genus_df = pd.DataFrame(unapproved_taxa_df['genus name'].unique(), columns=['genus name'])\n", - "\n", - "log_df(unapproved_genus_df)" + "diff = set(species_df.columns) - set(unapproved_df.columns)\n", + "diff" ] }, { "cell_type": "code", - "execution_count": 261, - "id": "14a9f430", + "execution_count": 9, + "id": "8d36e60c-1ed9-4563-9bf1-504db8fddeaf", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(1707, 4)\n" + "(9024, 28)\n" ] }, { @@ -2645,253 +2558,250 @@ " \n", " \n", " \n", + " taxon_group\n", + " verbatim_name\n", + " name\n", + " genus modifier\n", " genus name\n", - " pbdb_taxon_id\n", - " pbdb_taxon_name\n", - " pbdb_taxon_rank\n", + " species modifier\n", + " species name\n", + " subspecies modifier\n", + " subspecies name\n", + " non-taxa descriptor\n", + " ...\n", + " class_taxon_id\n", + " class_taxon_name\n", + " phylum_taxon_id\n", + " phylum_taxon_name\n", + " kingdom_taxon_id\n", + " kingdom_taxon_name\n", + " genus_taxon_id\n", + " genus_taxon_name\n", + " unranked clade_taxon_id\n", + " unranked clade_taxon_name\n", " \n", " \n", " \n", " \n", " 0\n", - " Astromma\n", - " 33.0\n", - " Astromma\n", - " genus\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Labyrinthodinium sp. 1\n", + " NaN\n", + " ?\n", + " Labyrinthodinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 1\n", - " Lagena\n", - " 1739.0\n", - " Lagena\n", - " genus\n", - " \n", - " \n", - " 2\n", - " Cretarhabdus\n", - " 87816.0\n", - " Cretarhabdus\n", - " genus\n", - " \n", - " \n", - " 3\n", - " Fasciculithus\n", - " 424283.0\n", - " Fasciculithus\n", - " genus\n", - " \n", - " \n", - " 4\n", - " Coscinodiscus\n", - " 71292.0\n", - " Coscinodiscus\n", - " genus\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " genus name pbdb_taxon_id pbdb_taxon_name pbdb_taxon_rank\n", - "0 Astromma 33.0 Astromma genus\n", - "1 Lagena 1739.0 Lagena genus\n", - "2 Cretarhabdus 87816.0 Cretarhabdus genus\n", - "3 Fasciculithus 424283.0 Fasciculithus genus\n", - "4 Coscinodiscus 71292.0 Coscinodiscus genus" - ] - }, - "execution_count": 261, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "noaa_1_96_genus_df = pd.read_csv(noaa_1_96_genus_path)\n", - "log_df(noaa_1_96_genus_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 262, - "id": "77095aa7", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "1651" - ] - }, - "execution_count": 262, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "janus_genus = set(unapproved_genus_df['genus name']) - set(noaa_1_96_genus_df['genus name'])\n", - "len(janus_genus)" - ] - }, - { - "cell_type": "markdown", - "id": "6b445de8", - "metadata": {}, - "source": [ - "add pbdb taxa data" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "4040c0e4", - "metadata": {}, - "outputs": [], - "source": [ - "PBDB_API = \"https://paleobiodb.org/data1.2/\"\n", - "PBDB_TAXA = f\"{PBDB_API}taxa/single.json?vocab=pbdb&name=\"" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "6e200217", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "0 50 100 150 200 250 300 350 400 450 500 550 600 650 700 750 800 850 900 950 1000 1050 1100 1150 1200 1250 1300 1350 1400 1450 1500 1550 1600 1650 1700 " - ] - } - ], - "source": [ - "for index, row in genus_df.iterrows():\n", - " if index % 50 == 0:\n", - " print(index, end=' ')\n", - " \n", - " url = PBDB_TAXA + row['genus name']\n", - " response = requests.get(url)\n", - " \n", - " if response.status_code == 200:\n", - " data = response.json()[\"records\"]\n", - " if len(data) == 1:\n", - " # cast taxon_no to string to avoid pandas converting it to a float \n", - " genus_df.at[index, 'pbdb_taxon_id'] = str(data[0][\"taxon_no\"])\n", - " genus_df.at[index, 'pbdb_taxon_name'] = data[0][\"taxon_name\"]\n", - " genus_df.at[index, 'pbdb_taxon_rank'] = data[0][\"taxon_rank\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "51b8037e", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
genus name
0Dictyocha
1PlanorotalitesDinoflagellates/Acritarchs/Prasinophytes?Maduradinium sp.NaN?MaduradiniumNaNsp.NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2GloborotaliaDinoflagellates/Acritarchs/Prasinophytes?Pyxidiella sp. 1NaN?PyxidiellaNaNsp.NaN1NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3GloboquadrinaDinoflagellates/Acritarchs/PrasinophytesAandalusiella ivoirensisNaNNaNAandalusiellaNaNivoirensisNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4SpirocyrtisDinoflagellates/Acritarchs/PrasinophytesAbratopdinium cardioformeNaNNaNAbratopdiniumNaNcardioformeNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 28 columns

\n", "
" ], "text/plain": [ - " genus name\n", - "0 Dictyocha\n", - "1 Planorotalites\n", - "2 Globorotalia\n", - "3 Globoquadrina\n", - "4 Spirocyrtis" + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor ... class_taxon_id \\\n", + "0 NaN 1 NaN ... NaN \n", + "1 NaN NaN NaN ... NaN \n", + "2 NaN 1 NaN ... NaN \n", + "3 NaN NaN NaN ... NaN \n", + "4 NaN NaN NaN ... NaN \n", + "\n", + " class_taxon_name phylum_taxon_id phylum_taxon_name kingdom_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " kingdom_taxon_name genus_taxon_id genus_taxon_name unranked clade_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " unranked clade_taxon_name \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "\n", + "[5 rows x 28 columns]" ] }, - "execution_count": 36, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "genus_df.head()" + "log_df(unapproved_df)\n", + "# (9024, 28)" ] }, { - "cell_type": "markdown", - "id": "c43cbb29", + "cell_type": "code", + "execution_count": 10, + "id": "82210dde-89f8-43e9-bb9c-c63b47a41e76", "metadata": {}, + "outputs": [], "source": [ - "create genus csv" + "unapproved_df.to_csv(unapproved_taxa_path, index=False)" ] }, { "cell_type": "code", - "execution_count": 48, - "id": "28d0041a", + "execution_count": null, + "id": "03e5953b-a536-488e-8166-208c46b3c1d6", "metadata": {}, "outputs": [], - "source": [ - "genus_df.to_csv(genus_path, index=False)" - ] + "source": [] }, { "cell_type": "markdown", - "id": "03b9ddce", - "metadata": {}, + "id": "94566696-bd40-467d-9cc9-50ef67b291ef", + "metadata": { + "tags": [] + }, "source": [ - "## add pbdb info to unapproved taxa " + "## create genus csv" ] }, { "cell_type": "code", - "execution_count": 37, - "id": "4520a16a", + "execution_count": 26, + "id": "bb3a9951-3fd0-457a-86a9-19c614f5790f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(1707, 4)\n" + "(1026, 15)\n" ] }, { @@ -2915,82 +2825,169 @@ " \n", " \n", " \n", + " taxon_group\n", " genus name\n", " pbdb_taxon_id\n", " pbdb_taxon_name\n", " pbdb_taxon_rank\n", + " family_taxon_id\n", + " family_taxon_name\n", + " order_taxon_id\n", + " order_taxon_name\n", + " class_taxon_id\n", + " class_taxon_name\n", + " phylum_taxon_id\n", + " phylum_taxon_name\n", + " kingdom_taxon_id\n", + " kingdom_taxon_name\n", " \n", " \n", " \n", " \n", " 0\n", - " Astromma\n", - " 33\n", - " Astromma\n", + " benthic_forams\n", + " Euuvigerina\n", + " 1408\n", + " Euuvigerina\n", " genus\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 288974\n", + " Foraminifera\n", + " 212476\n", + " Rhizaria\n", " \n", " \n", " 1\n", - " Lagena\n", - " 1739\n", - " Lagena\n", + " benthic_forams\n", + " Nodosaria\n", + " 1952\n", + " Nodosaria\n", " genus\n", + " 82197\n", + " Nodosariidae\n", + " 429322\n", + " Nodosariida\n", + " 428875\n", + " Nodosariata\n", + " 288974\n", + " Foraminifera\n", + " 212476\n", + " Rhizaria\n", " \n", " \n", " 2\n", - " Cretarhabdus\n", - " 87816\n", - " Cretarhabdus\n", + " benthic_forams\n", + " Cibicides\n", + " 1107\n", + " Cibicides\n", " genus\n", + " 82208\n", + " Cibicididae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 288974\n", + " Foraminifera\n", + " 212476\n", + " Rhizaria\n", " \n", " \n", " 3\n", - " Fasciculithus\n", - " 424283\n", - " Fasciculithus\n", + " benthic_forams\n", + " Brizalina\n", + " 1017\n", + " Brizalina\n", " genus\n", + " 112279\n", + " Bolivinidae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 288974\n", + " Foraminifera\n", + " 212476\n", + " Rhizaria\n", " \n", " \n", " 4\n", - " Coscinodiscus\n", - " 71292\n", - " Coscinodiscus\n", + " planktic_forams\n", + " Candeina\n", + " 1053\n", + " Candeina\n", " genus\n", + " 422277\n", + " Candeinidae\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 288974\n", + " Foraminifera\n", + " 212476\n", + " Rhizaria\n", " \n", " \n", "\n", "" ], "text/plain": [ - " genus name pbdb_taxon_id pbdb_taxon_name pbdb_taxon_rank\n", - "0 Astromma 33 Astromma genus\n", - "1 Lagena 1739 Lagena genus\n", - "2 Cretarhabdus 87816 Cretarhabdus genus\n", - "3 Fasciculithus 424283 Fasciculithus genus\n", - "4 Coscinodiscus 71292 Coscinodiscus genus" + " taxon_group genus name pbdb_taxon_id pbdb_taxon_name pbdb_taxon_rank \\\n", + "0 benthic_forams Euuvigerina 1408 Euuvigerina genus \n", + "1 benthic_forams Nodosaria 1952 Nodosaria genus \n", + "2 benthic_forams Cibicides 1107 Cibicides genus \n", + "3 benthic_forams Brizalina 1017 Brizalina genus \n", + "4 planktic_forams Candeina 1053 Candeina genus \n", + "\n", + " family_taxon_id family_taxon_name order_taxon_id order_taxon_name \\\n", + "0 NaN NaN NaN NaN \n", + "1 82197 Nodosariidae 429322 Nodosariida \n", + "2 82208 Cibicididae NaN NaN \n", + "3 112279 Bolivinidae NaN NaN \n", + "4 422277 Candeinidae NaN NaN \n", + "\n", + " class_taxon_id class_taxon_name phylum_taxon_id phylum_taxon_name \\\n", + "0 NaN NaN 288974 Foraminifera \n", + "1 428875 Nodosariata 288974 Foraminifera \n", + "2 NaN NaN 288974 Foraminifera \n", + "3 NaN NaN 288974 Foraminifera \n", + "4 NaN NaN 288974 Foraminifera \n", + "\n", + " kingdom_taxon_id kingdom_taxon_name \n", + "0 212476 Rhizaria \n", + "1 212476 Rhizaria \n", + "2 212476 Rhizaria \n", + "3 212476 Rhizaria \n", + "4 212476 Rhizaria " ] }, - "execution_count": 37, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "genus_df = pd.read_csv(genus_path, dtype={'pbdb_taxon_id': str})\n", - "log_df(genus_df)" + "lims_genus_df = pd.read_csv(lims_genus_path, dtype=str)\n", + "log_df(lims_genus_df)" ] }, { "cell_type": "code", - "execution_count": 38, - "id": "fd5d36dc", + "execution_count": 27, + "id": "43d8ab9c-2759-48ae-9d67-f44018cd4c9f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(7763, 7)\n" + "(9024, 28)\n" ] }, { @@ -3014,123 +3011,222 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", + " verbatim_name\n", + " name\n", + " genus modifier\n", " genus name\n", - " simplified_name\n", + " species modifier\n", " species name\n", + " subspecies modifier\n", " subspecies name\n", - " _merge_approved\n", + " non-taxa descriptor\n", + " ...\n", + " class_taxon_id\n", + " class_taxon_name\n", + " phylum_taxon_id\n", + " phylum_taxon_name\n", + " kingdom_taxon_id\n", + " kingdom_taxon_name\n", + " genus_taxon_id\n", + " genus_taxon_name\n", + " unranked clade_taxon_id\n", + " unranked clade_taxon_name\n", " \n", " \n", " \n", " \n", " 0\n", - " Dictyocha brevispina brevispina (q)\n", - " silicoflagellates\n", - " Dictyocha\n", - " Dictyocha brevispina brevispina\n", - " brevispina\n", - " brevispina\n", - " left_only\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Labyrinthodinium sp. 1\n", + " NaN\n", + " ?\n", + " Labyrinthodinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " 1\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " \n", " \n", " 1\n", - " Planorotalites ehrenbergi\n", - " planktic_forams\n", - " Planorotalites\n", - " Planorotalites ehrenbergi\n", - " ehrenbergi\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Maduradinium sp.\n", + " NaN\n", + " ?\n", + " Maduradinium\n", + " NaN\n", + " sp.\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " left_only\n", " \n", " \n", " 2\n", - " Globorotalia miozea sphericomiozea\n", - " planktic_forams\n", - " Globorotalia\n", - " Globorotalia miozea sphericomiozea\n", - " miozea\n", - " sphericomiozea\n", - " left_only\n", - " \n", - " \n", - " 3\n", - " Globoquadrina globosa\n", - " planktic_forams\n", - " Globoquadrina\n", - " Globoquadrina globosa\n", - " globosa\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " ?Pyxidiella sp. 1\n", " NaN\n", - " left_only\n", - " \n", - " \n", - " 4\n", - " Spirocyrtis scalaris\n", - " radiolarians\n", - " Spirocyrtis\n", - " Spirocyrtis scalaris\n", - " scalaris\n", + " ?\n", + " Pyxidiella\n", " NaN\n", - " left_only\n", - " \n", - " \n", + " sp.\n", + " NaN\n", + " 1\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 3\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella ivoirensis\n", + " NaN\n", + " NaN\n", + " Aandalusiella\n", + " NaN\n", + " ivoirensis\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", + " 4\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium cardioforme\n", + " NaN\n", + " NaN\n", + " Abratopdinium\n", + " NaN\n", + " cardioforme\n", + " NaN\n", + " NaN\n", + " NaN\n", + " ...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " \n", + " \n", "\n", + "

5 rows × 28 columns

\n", "" ], "text/plain": [ - " verbatim_name taxon_group genus name \\\n", - "0 Dictyocha brevispina brevispina (q) silicoflagellates Dictyocha \n", - "1 Planorotalites ehrenbergi planktic_forams Planorotalites \n", - "2 Globorotalia miozea sphericomiozea planktic_forams Globorotalia \n", - "3 Globoquadrina globosa planktic_forams Globoquadrina \n", - "4 Spirocyrtis scalaris radiolarians Spirocyrtis \n", - "\n", - " simplified_name species name subspecies name \\\n", - "0 Dictyocha brevispina brevispina brevispina brevispina \n", - "1 Planorotalites ehrenbergi ehrenbergi NaN \n", - "2 Globorotalia miozea sphericomiozea miozea sphericomiozea \n", - "3 Globoquadrina globosa globosa NaN \n", - "4 Spirocyrtis scalaris scalaris NaN \n", - "\n", - " _merge_approved \n", - "0 left_only \n", - "1 left_only \n", - "2 left_only \n", - "3 left_only \n", - "4 left_only " + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor ... class_taxon_id \\\n", + "0 NaN 1 NaN ... NaN \n", + "1 NaN NaN NaN ... NaN \n", + "2 NaN 1 NaN ... NaN \n", + "3 NaN NaN NaN ... NaN \n", + "4 NaN NaN NaN ... NaN \n", + "\n", + " class_taxon_name phylum_taxon_id phylum_taxon_name kingdom_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " kingdom_taxon_name genus_taxon_id genus_taxon_name \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " unranked clade_taxon_id unranked clade_taxon_name \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", + "\n", + "[5 rows x 28 columns]" ] }, - "execution_count": 38, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "unapproved_df = pd.read_csv(taxa_path)\n", - "\n", - "log_df(unapproved_df)" - ] - }, - { - "cell_type": "markdown", - "id": "7745c14d", - "metadata": {}, - "source": [ - "merge NOAA unapproved taxa with pbdb data" + "df = pd.read_csv(unapproved_taxa_path)\n", + "log_df(df)" ] }, { "cell_type": "code", - "execution_count": 39, - "id": "e67b404a", + "execution_count": 28, + "id": "e19d4947-5273-4eba-bed3-c57eb88556b8", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(7763, 11)\n" + "(2545, 3)\n" ] }, { @@ -3154,150 +3250,79 @@ " \n", " \n", " \n", - " verbatim_name\n", " taxon_group\n", " genus name\n", - " simplified_name\n", - " species name\n", - " subspecies name\n", - " _merge_approved\n", - " pbdb_taxon_id\n", - " pbdb_taxon_name\n", - " pbdb_taxon_rank\n", - " _merge_pbdb\n", + " check\n", " \n", " \n", " \n", " \n", " 0\n", - " Dictyocha brevispina brevispina (q)\n", - " silicoflagellates\n", - " Dictyocha\n", - " Dictyocha brevispina brevispina\n", - " brevispina\n", - " brevispina\n", - " left_only\n", - " 71284\n", - " Dictyocha\n", - " genus\n", - " both\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Labyrinthodinium\n", + " False\n", " \n", " \n", " 1\n", - " Planorotalites ehrenbergi\n", - " planktic_forams\n", - " Planorotalites\n", - " Planorotalites ehrenbergi\n", - " ehrenbergi\n", - " NaN\n", - " left_only\n", - " 2146\n", - " Planorotalites\n", - " genus\n", - " both\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Maduradinium\n", + " False\n", " \n", " \n", " 2\n", - " Globorotalia miozea sphericomiozea\n", - " planktic_forams\n", - " Globorotalia\n", - " Globorotalia miozea sphericomiozea\n", - " miozea\n", - " sphericomiozea\n", - " left_only\n", - " 1521\n", - " Globorotalia\n", - " genus\n", - " both\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Pyxidiella\n", + " False\n", " \n", " \n", " 3\n", - " Globoquadrina globosa\n", - " planktic_forams\n", - " Globoquadrina\n", - " Globoquadrina globosa\n", - " globosa\n", - " NaN\n", - " left_only\n", - " 1518\n", - " Globoquadrina\n", - " genus\n", - " both\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella\n", + " False\n", " \n", " \n", " 4\n", - " Spirocyrtis scalaris\n", - " radiolarians\n", - " Spirocyrtis\n", - " Spirocyrtis scalaris\n", - " scalaris\n", - " NaN\n", - " left_only\n", - " 654\n", - " Spirocyrtis\n", - " genus\n", - " both\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium\n", + " False\n", " \n", " \n", "\n", "" ], "text/plain": [ - " verbatim_name taxon_group genus name \\\n", - "0 Dictyocha brevispina brevispina (q) silicoflagellates Dictyocha \n", - "1 Planorotalites ehrenbergi planktic_forams Planorotalites \n", - "2 Globorotalia miozea sphericomiozea planktic_forams Globorotalia \n", - "3 Globoquadrina globosa planktic_forams Globoquadrina \n", - "4 Spirocyrtis scalaris radiolarians Spirocyrtis \n", - "\n", - " simplified_name species name subspecies name \\\n", - "0 Dictyocha brevispina brevispina brevispina brevispina \n", - "1 Planorotalites ehrenbergi ehrenbergi NaN \n", - "2 Globorotalia miozea sphericomiozea miozea sphericomiozea \n", - "3 Globoquadrina globosa globosa NaN \n", - "4 Spirocyrtis scalaris scalaris NaN \n", - "\n", - " _merge_approved pbdb_taxon_id pbdb_taxon_name pbdb_taxon_rank _merge_pbdb \n", - "0 left_only 71284 Dictyocha genus both \n", - "1 left_only 2146 Planorotalites genus both \n", - "2 left_only 1521 Globorotalia genus both \n", - "3 left_only 1518 Globoquadrina genus both \n", - "4 left_only 654 Spirocyrtis genus both " + " taxon_group genus name check\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes Labyrinthodinium False\n", + "1 Dinoflagellates/Acritarchs/Prasinophytes Maduradinium False\n", + "2 Dinoflagellates/Acritarchs/Prasinophytes Pyxidiella False\n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella False\n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium False" ] }, - "execution_count": 39, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "merged_df = pd.merge(unapproved_taxa_df, genus_df, \n", - " on = 'genus name', \n", - " how='left',\n", - " indicator='_merge_pbdb')\n", + "genus_df = df[df['genus name'].notna()].copy()[['taxon_group', 'genus name']]\n", + "genus_df.drop_duplicates(inplace=True)\n", + "genus_df['check'] = False\n", "\n", - "log_df(merged_df)" - ] - }, - { - "cell_type": "markdown", - "id": "927bfcff", - "metadata": {}, - "source": [ - "reorder columns and sort rows" + "log_df(genus_df)" ] }, { "cell_type": "code", - "execution_count": 40, - "id": "002253fa", + "execution_count": 29, + "id": "7e227bcd-2c75-4dbb-b259-673799cfb011", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "(7763, 11)\n" + "(2545, 16)\n" ] }, { @@ -3322,157 +3347,1984 @@ " \n", " \n", " taxon_group\n", - " verbatim_name\n", " genus name\n", - " species name\n", - " subspecies name\n", + " check\n", " pbdb_taxon_id\n", " pbdb_taxon_name\n", " pbdb_taxon_rank\n", - " _simplified_name\n", - " _merge_approved\n", - " _merge_pbdb\n", + " family_taxon_id\n", + " family_taxon_name\n", + " order_taxon_id\n", + " order_taxon_name\n", + " class_taxon_id\n", + " class_taxon_name\n", + " phylum_taxon_id\n", + " phylum_taxon_name\n", + " kingdom_taxon_id\n", + " kingdom_taxon_name\n", " \n", " \n", " \n", " \n", - " 822\n", - " benthic_forams\n", - " Abyssamina incisa\n", - " Abyssamina\n", - " incisa\n", + " 0\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Labyrinthodinium\n", + " False\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 762\n", - " Abyssamina\n", - " genus\n", " NaN\n", - " left_only\n", - " both\n", " \n", " \n", - " 6100\n", - " benthic_forams\n", - " Adercotryma glomeratum\n", - " Adercotryma\n", - " glomeratum\n", + " 1\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Maduradinium\n", + " False\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 774\n", - " Adercotryma\n", - " genus\n", " NaN\n", - " left_only\n", - " both\n", " \n", " \n", - " 2822\n", - " benthic_forams\n", - " Adercotryma sp.\n", - " Adercotryma\n", - " sp.\n", + " 2\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Pyxidiella\n", + " False\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 774\n", - " Adercotryma\n", - " genus\n", " NaN\n", - " left_only\n", - " both\n", " \n", " \n", - " 6167\n", - " benthic_forams\n", - " Alabamina decorata\n", - " Alabamina\n", - " decorata\n", + " 3\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Aandalusiella\n", + " False\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 788\n", - " Alabamina\n", - " genus\n", " NaN\n", - " left_only\n", - " both\n", " \n", " \n", - " 3517\n", - " benthic_forams\n", - " Alabamina haitiensis\n", - " Alabamina\n", - " haitiensis\n", + " 4\n", + " Dinoflagellates/Acritarchs/Prasinophytes\n", + " Abratopdinium\n", + " False\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", " NaN\n", - " 788\n", - " Alabamina\n", - " genus\n", " NaN\n", - " left_only\n", - " both\n", " \n", " \n", "\n", "" ], "text/plain": [ - " taxon_group verbatim_name genus name species name \\\n", - "822 benthic_forams Abyssamina incisa Abyssamina incisa \n", - "6100 benthic_forams Adercotryma glomeratum Adercotryma glomeratum \n", - "2822 benthic_forams Adercotryma sp. Adercotryma sp. \n", - "6167 benthic_forams Alabamina decorata Alabamina decorata \n", - "3517 benthic_forams Alabamina haitiensis Alabamina haitiensis \n", - "\n", - " subspecies name pbdb_taxon_id pbdb_taxon_name pbdb_taxon_rank \\\n", - "822 NaN 762 Abyssamina genus \n", - "6100 NaN 774 Adercotryma genus \n", - "2822 NaN 774 Adercotryma genus \n", - "6167 NaN 788 Alabamina genus \n", - "3517 NaN 788 Alabamina genus \n", - "\n", - " _simplified_name _merge_approved _merge_pbdb \n", - "822 NaN left_only both \n", - "6100 NaN left_only both \n", - "2822 NaN left_only both \n", - "6167 NaN left_only both \n", - "3517 NaN left_only both " + " taxon_group genus name check \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes Labyrinthodinium False \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes Maduradinium False \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes Pyxidiella False \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella False \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium False \n", + "\n", + " pbdb_taxon_id pbdb_taxon_name pbdb_taxon_rank family_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " family_taxon_name order_taxon_id order_taxon_name class_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " class_taxon_name phylum_taxon_id phylum_taxon_name kingdom_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " kingdom_taxon_name \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN " ] }, - "execution_count": 40, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "merged_df = merged_df.reindex(columns=['taxon_group', 'verbatim_name', \n", - " 'genus name', 'species name', \n", - " 'subspecies name',\n", - " 'pbdb_taxon_id', 'pbdb_taxon_name',\n", - " 'pbdb_taxon_rank', '_simplified_name',\n", - " '_merge_approved', '_merge_pbdb'\n", - " ])\n", - "\n", - "merged_df.sort_values(by=['taxon_group', 'verbatim_name'], inplace=True)\n", + "genus_df = genus_df.merge(lims_genus_df, how='left')\n", + "genus_df.loc[genus_df['pbdb_taxon_id'].notna(), 'check'] = True\n", "\n", - "log_df(merged_df)" + "log_df(genus_df)" ] }, { - "cell_type": "markdown", - "id": "21f1d0b0", + "cell_type": "code", + "execution_count": 99, + "id": "068d2343-f7f4-42cb-9aee-2001bde9feb7", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Index(['taxon_group', 'genus name', 'check', 'pbdb_taxon_id',\n", + " 'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id',\n", + " 'family_taxon_name', 'order_taxon_id', 'order_taxon_name',\n", + " 'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',\n", + " 'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',\n", + " 'unranked clade_taxon_id', 'unranked clade_taxon_name',\n", + " 'subclass_taxon_id', 'subclass_taxon_name', 'genus_taxon_id',\n", + " 'genus_taxon_name'],\n", + " dtype='object')\n", + "800 1000 1350 1750 1800 1850 2100 2150 2200 2250 2300 2350 2400 2450 2500 " + ] + } + ], + "source": [ + "pbdb.fetch_pdbd_data(genus_df, 'genus name')" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "43e9d5a0-35ba-4a29-a722-61b377897f31", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(0, 22)" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "save csv" + "genus_df[genus_df['check'] == False].shape" ] }, { "cell_type": "code", - "execution_count": 41, - "id": "e786ddfb", + "execution_count": 107, + "id": "8f27c9a1-3ed5-49e3-b260-336dfcd65575", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2545, 22)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
taxon_groupgenus namecheckpbdb_taxon_idpbdb_taxon_namepbdb_taxon_rankfamily_taxon_idfamily_taxon_nameorder_taxon_idorder_taxon_name...phylum_taxon_idphylum_taxon_namekingdom_taxon_idkingdom_taxon_nameunranked clade_taxon_idunranked clade_taxon_namesubclass_taxon_idsubclass_taxon_namegenus_taxon_idgenus_taxon_name
0Dinoflagellates/Acritarchs/PrasinophytesLabyrinthodiniumTrue443826LabyrinthodiniumgenusNaNNaN321606Gonyaulacales...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1Dinoflagellates/Acritarchs/PrasinophytesMaduradiniumTrue325673Maduradiniumgenus277915Peridiniaceae277919Peridiniales...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2Dinoflagellates/Acritarchs/PrasinophytesPyxidiellaTrue336773Pyxidiellagenus277915Peridiniaceae277919Peridiniales...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3Dinoflagellates/Acritarchs/PrasinophytesAandalusiellaTrueNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4Dinoflagellates/Acritarchs/PrasinophytesAbratopdiniumTrueNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " taxon_group genus name check \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes Labyrinthodinium True \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes Maduradinium True \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes Pyxidiella True \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella True \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium True \n", + "\n", + " pbdb_taxon_id pbdb_taxon_name pbdb_taxon_rank family_taxon_id \\\n", + "0 443826 Labyrinthodinium genus NaN \n", + "1 325673 Maduradinium genus 277915 \n", + "2 336773 Pyxidiella genus 277915 \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " family_taxon_name order_taxon_id order_taxon_name ... phylum_taxon_id \\\n", + "0 NaN 321606 Gonyaulacales ... NaN \n", + "1 Peridiniaceae 277919 Peridiniales ... NaN \n", + "2 Peridiniaceae 277919 Peridiniales ... NaN \n", + "3 NaN NaN NaN ... NaN \n", + "4 NaN NaN NaN ... NaN \n", + "\n", + " phylum_taxon_name kingdom_taxon_id kingdom_taxon_name \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " unranked clade_taxon_id unranked clade_taxon_name subclass_taxon_id \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " subclass_taxon_name genus_taxon_id genus_taxon_name \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + "[5 rows x 22 columns]" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "log_df(genus_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "bb622f76-0b49-47cd-a3d2-8ed1b9443f24", + "metadata": {}, + "outputs": [], + "source": [ + "del genus_df['check']" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "7a68c28f-8536-413a-b2c5-d1736159daf5", + "metadata": {}, + "outputs": [], + "source": [ + "genus_df.to_csv(genus_path, index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "03b9ddce", + "metadata": { + "tags": [] + }, + "source": [ + "## add genus pbdb info to unapproved taxa " + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4520a16a", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(2545, 21)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
taxon_groupgenus namepbdb_taxon_idpbdb_taxon_namepbdb_taxon_rankfamily_taxon_idfamily_taxon_nameorder_taxon_idorder_taxon_nameclass_taxon_id...phylum_taxon_idphylum_taxon_namekingdom_taxon_idkingdom_taxon_nameunranked clade_taxon_idunranked clade_taxon_namesubclass_taxon_idsubclass_taxon_namegenus_taxon_idgenus_taxon_name
0Dinoflagellates/Acritarchs/PrasinophytesLabyrinthodinium443826LabyrinthodiniumgenusNaNNaN321606Gonyaulacales321578...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1Dinoflagellates/Acritarchs/PrasinophytesMaduradinium325673Maduradiniumgenus277915Peridiniaceae277919Peridiniales321578...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2Dinoflagellates/Acritarchs/PrasinophytesPyxidiella336773Pyxidiellagenus277915Peridiniaceae277919Peridiniales321578...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3Dinoflagellates/Acritarchs/PrasinophytesAandalusiellaNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4Dinoflagellates/Acritarchs/PrasinophytesAbratopdiniumNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " taxon_group genus name pbdb_taxon_id \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes Labyrinthodinium 443826 \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes Maduradinium 325673 \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes Pyxidiella 336773 \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium NaN \n", + "\n", + " pbdb_taxon_name pbdb_taxon_rank family_taxon_id family_taxon_name \\\n", + "0 Labyrinthodinium genus NaN NaN \n", + "1 Maduradinium genus 277915 Peridiniaceae \n", + "2 Pyxidiella genus 277915 Peridiniaceae \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " order_taxon_id order_taxon_name class_taxon_id ... phylum_taxon_id \\\n", + "0 321606 Gonyaulacales 321578 ... NaN \n", + "1 277919 Peridiniales 321578 ... NaN \n", + "2 277919 Peridiniales 321578 ... NaN \n", + "3 NaN NaN NaN ... NaN \n", + "4 NaN NaN NaN ... NaN \n", + "\n", + " phylum_taxon_name kingdom_taxon_id kingdom_taxon_name \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " unranked clade_taxon_id unranked clade_taxon_name subclass_taxon_id \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " subclass_taxon_name genus_taxon_id genus_taxon_name \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "genus_df = pd.read_csv(genus_path, dtype=str)\n", + "log_df(genus_df)\n", + "# (2545, 21)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "fd5d36dc", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(9024, 28)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
taxon_groupverbatim_namenamegenus modifiergenus namespecies modifierspecies namesubspecies modifiersubspecies namenon-taxa descriptor...class_taxon_idclass_taxon_namephylum_taxon_idphylum_taxon_namekingdom_taxon_idkingdom_taxon_namegenus_taxon_idgenus_taxon_nameunranked clade_taxon_idunranked clade_taxon_name
0Dinoflagellates/Acritarchs/Prasinophytes?Labyrinthodinium sp. 1NaN?LabyrinthodiniumNaNsp.NaN1NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1Dinoflagellates/Acritarchs/Prasinophytes?Maduradinium sp.NaN?MaduradiniumNaNsp.NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2Dinoflagellates/Acritarchs/Prasinophytes?Pyxidiella sp. 1NaN?PyxidiellaNaNsp.NaN1NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3Dinoflagellates/Acritarchs/PrasinophytesAandalusiella ivoirensisNaNNaNAandalusiellaNaNivoirensisNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4Dinoflagellates/Acritarchs/PrasinophytesAbratopdinium cardioformeNaNNaNAbratopdiniumNaNcardioformeNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor ... class_taxon_id \\\n", + "0 NaN 1 NaN ... NaN \n", + "1 NaN NaN NaN ... NaN \n", + "2 NaN 1 NaN ... NaN \n", + "3 NaN NaN NaN ... NaN \n", + "4 NaN NaN NaN ... NaN \n", + "\n", + " class_taxon_name phylum_taxon_id phylum_taxon_name kingdom_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " kingdom_taxon_name genus_taxon_id genus_taxon_name unranked clade_taxon_id \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " unranked clade_taxon_name \n", + "0 NaN \n", + "1 NaN \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unapproved_df = pd.read_csv(unapproved_taxa_path, dtype=str)\n", + "\n", + "log_df(unapproved_df)\n", + "# (9024, 28)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "e67b404a", + "metadata": {}, + "outputs": [], + "source": [ + "pbdb.add_pbdb_data(unapproved_df, genus_df, 'genus name')" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2ea97042-cd3d-4764-b4ed-c653fc8b1071", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "set()" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "diff = set(genus_df.columns) - set(unapproved_df.columns)\n", + "diff" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "0147edaa-143b-4fdd-a272-96956f0abd0f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(9024, 30)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
taxon_groupverbatim_namenamegenus modifiergenus namespecies modifierspecies namesubspecies modifiersubspecies namenon-taxa descriptor...phylum_taxon_idphylum_taxon_namekingdom_taxon_idkingdom_taxon_namegenus_taxon_idgenus_taxon_nameunranked clade_taxon_idunranked clade_taxon_namesubclass_taxon_idsubclass_taxon_name
0Dinoflagellates/Acritarchs/Prasinophytes?Labyrinthodinium sp. 1NaN?LabyrinthodiniumNaNsp.NaN1NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1Dinoflagellates/Acritarchs/Prasinophytes?Maduradinium sp.NaN?MaduradiniumNaNsp.NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2Dinoflagellates/Acritarchs/Prasinophytes?Pyxidiella sp. 1NaN?PyxidiellaNaNsp.NaN1NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3Dinoflagellates/Acritarchs/PrasinophytesAandalusiella ivoirensisNaNNaNAandalusiellaNaNivoirensisNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4Dinoflagellates/Acritarchs/PrasinophytesAbratopdinium cardioformeNaNNaNAbratopdiniumNaNcardioformeNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor ... \\\n", + "0 NaN 1 NaN ... \n", + "1 NaN NaN NaN ... \n", + "2 NaN 1 NaN ... \n", + "3 NaN NaN NaN ... \n", + "4 NaN NaN NaN ... \n", + "\n", + " phylum_taxon_id phylum_taxon_name kingdom_taxon_id kingdom_taxon_name \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " genus_taxon_id genus_taxon_name unranked clade_taxon_id \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " unranked clade_taxon_name subclass_taxon_id subclass_taxon_name \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + "[5 rows x 30 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "log_df(unapproved_df)\n", + "# (9024, 30)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "2c74acf3-f686-4baa-b552-7bba60f4a8f4", + "metadata": {}, + "outputs": [], + "source": [ + "unapproved_df.to_csv(unapproved_taxa_path, index=False)" + ] + }, + { + "cell_type": "markdown", + "id": "78db36fa-f08b-4c79-8776-ed737dafe609", + "metadata": {}, + "source": [ + "## update columns in taxa list" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "76a1c585-e766-4d17-a89e-0c805ca46545", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(9024, 30)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
taxon_groupverbatim_namenamegenus modifiergenus namespecies modifierspecies namesubspecies modifiersubspecies namenon-taxa descriptor...phylum_taxon_idphylum_taxon_namekingdom_taxon_idkingdom_taxon_namegenus_taxon_idgenus_taxon_nameunranked clade_taxon_idunranked clade_taxon_namesubclass_taxon_idsubclass_taxon_name
0Dinoflagellates/Acritarchs/Prasinophytes?Labyrinthodinium sp. 1NaN?LabyrinthodiniumNaNsp.NaN1NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
1Dinoflagellates/Acritarchs/Prasinophytes?Maduradinium sp.NaN?MaduradiniumNaNsp.NaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
2Dinoflagellates/Acritarchs/Prasinophytes?Pyxidiella sp. 1NaN?PyxidiellaNaNsp.NaN1NaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
3Dinoflagellates/Acritarchs/PrasinophytesAandalusiella ivoirensisNaNNaNAandalusiellaNaNivoirensisNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4Dinoflagellates/Acritarchs/PrasinophytesAbratopdinium cardioformeNaNNaNAbratopdiniumNaNcardioformeNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

5 rows × 30 columns

\n", + "
" + ], + "text/plain": [ + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor ... \\\n", + "0 NaN 1 NaN ... \n", + "1 NaN NaN NaN ... \n", + "2 NaN 1 NaN ... \n", + "3 NaN NaN NaN ... \n", + "4 NaN NaN NaN ... \n", + "\n", + " phylum_taxon_id phylum_taxon_name kingdom_taxon_id kingdom_taxon_name \\\n", + "0 NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN \n", + "\n", + " genus_taxon_id genus_taxon_name unranked clade_taxon_id \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + " unranked clade_taxon_name subclass_taxon_id subclass_taxon_name \n", + "0 NaN NaN NaN \n", + "1 NaN NaN NaN \n", + "2 NaN NaN NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN \n", + "\n", + "[5 rows x 30 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unapproved_df = pd.read_csv(unapproved_taxa_path, dtype=str)\n", + "\n", + "log_df(unapproved_df)\n", + "# (9024, 30)" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "16154a85-59a0-46a6-9f4b-1e76fca33364", + "metadata": {}, + "outputs": [], + "source": [ + "PI_df = pd.read_csv(PI_noaa_1_96_taxa_path, dtype=str)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "5ea4a13b-613e-4332-84c7-31431625dec9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'Any taxon above genus',\n", + " 'Any taxon above genus modifier',\n", + " 'Comment',\n", + " 'Corrections to pbdb_taxon',\n", + " 'Corrections to pbdb_taxon_rank',\n", + " 'Notes (change to Internal only notes?)',\n", + " 'comments',\n", + " 'subgenera modifier',\n", + " 'subgenera name',\n", + " 'superfamily_taxon_id',\n", + " 'superfamily_taxon_name'}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "set([col.replace('.1', '') for col in PI_df.columns]) - set(unapproved_df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "01cb5202-564e-4993-9f55-6fc3cdb63cf8", + "metadata": {}, + "outputs": [], + "source": [ + "unapproved_df['Any taxon above genus'] = pd.NA\n", + "unapproved_df['Any taxon above genus modifier'] = pd.NA\n", + "unapproved_df['Comment'] = pd.NA\n", + "unapproved_df['Corrections to pbdb_taxon'] = pd.NA\n", + "unapproved_df['Notes (change to Internal only notes?)'] = pd.NA\n", + "unapproved_df['comments'] = pd.NA\n", + "unapproved_df['subgenera modifier'] = pd.NA\n", + "unapproved_df['subgenera name'] = pd.NA\n", + "unapproved_df['species_taxon_id'] = pd.NA\n", + "unapproved_df['species_taxon_name'] = pd.NA\n", + "\n", + "del unapproved_df['genus species']" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "6a88494f-cfb6-4615-9fd3-fbf4111f3c00", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "39" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(unapproved_df.columns)\n", + "# 39" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "16f08907-cab5-4f3a-8af3-ed5451b6d78b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(9024, 39)\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
taxon_groupverbatim_namenamegenus modifiergenus namespecies modifierspecies namesubspecies modifiersubspecies namenon-taxa descriptor...Any taxon above genusAny taxon above genus modifierCommentCorrections to pbdb_taxonNotes (change to Internal only notes?)commentssubgenera modifiersubgenera namespecies_taxon_idspecies_taxon_name
0Dinoflagellates/Acritarchs/Prasinophytes?Labyrinthodinium sp. 1NaN?LabyrinthodiniumNaNsp.NaN1NaN...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
1Dinoflagellates/Acritarchs/Prasinophytes?Maduradinium sp.NaN?MaduradiniumNaNsp.NaNNaNNaN...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
2Dinoflagellates/Acritarchs/Prasinophytes?Pyxidiella sp. 1NaN?PyxidiellaNaNsp.NaN1NaN...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
3Dinoflagellates/Acritarchs/PrasinophytesAandalusiella ivoirensisNaNNaNAandalusiellaNaNivoirensisNaNNaNNaN...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
4Dinoflagellates/Acritarchs/PrasinophytesAbratopdinium cardioformeNaNNaNAbratopdiniumNaNcardioformeNaNNaNNaN...<NA><NA><NA><NA><NA><NA><NA><NA><NA><NA>
\n", + "

5 rows × 39 columns

\n", + "
" + ], + "text/plain": [ + " taxon_group verbatim_name name \\\n", + "0 Dinoflagellates/Acritarchs/Prasinophytes ?Labyrinthodinium sp. 1 NaN \n", + "1 Dinoflagellates/Acritarchs/Prasinophytes ?Maduradinium sp. NaN \n", + "2 Dinoflagellates/Acritarchs/Prasinophytes ?Pyxidiella sp. 1 NaN \n", + "3 Dinoflagellates/Acritarchs/Prasinophytes Aandalusiella ivoirensis NaN \n", + "4 Dinoflagellates/Acritarchs/Prasinophytes Abratopdinium cardioforme NaN \n", + "\n", + " genus modifier genus name species modifier species name \\\n", + "0 ? Labyrinthodinium NaN sp. \n", + "1 ? Maduradinium NaN sp. \n", + "2 ? Pyxidiella NaN sp. \n", + "3 NaN Aandalusiella NaN ivoirensis \n", + "4 NaN Abratopdinium NaN cardioforme \n", + "\n", + " subspecies modifier subspecies name non-taxa descriptor ... \\\n", + "0 NaN 1 NaN ... \n", + "1 NaN NaN NaN ... \n", + "2 NaN 1 NaN ... \n", + "3 NaN NaN NaN ... \n", + "4 NaN NaN NaN ... \n", + "\n", + " Any taxon above genus Any taxon above genus modifier Comment \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "\n", + " Corrections to pbdb_taxon Notes (change to Internal only notes?) comments \\\n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "\n", + " subgenera modifier subgenera name species_taxon_id species_taxon_name \n", + "0 \n", + "1 \n", + "2 \n", + "3 \n", + "4 \n", + "\n", + "[5 rows x 39 columns]" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "for rank in ['species', 'genus', 'family', 'order', 'class', 'phylum']:\n", + " unapproved_df.loc[unapproved_df['pbdb_taxon_rank'] == rank, f'{rank}_taxon_name' ] = unapproved_df['pbdb_taxon_name']\n", + " unapproved_df.loc[unapproved_df['pbdb_taxon_rank'] == rank, f'{rank}_taxon_id' ] = unapproved_df['pbdb_taxon_id']\n", + "\n", + "log_df(unapproved_df)\n", + "# (9024, 39)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "5613b97e-4ec2-41f8-9393-87c329dee970", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['taxon_group', 'verbatim_name', 'name', 'Comment',\n", + " 'Notes (change to Internal only notes?)',\n", + " 'Any taxon above genus modifier', 'Any taxon above genus',\n", + " 'genus modifier', 'genus name', 'subgenera modifier', 'subgenera name',\n", + " 'species modifier', 'species name', 'subspecies modifier',\n", + " 'subspecies name', 'non-taxa descriptor', 'comments', 'pbdb_taxon_id',\n", + " 'pbdb_taxon_name', 'pbdb_taxon_rank', 'Corrections to pbdb_taxon_rank',\n", + " 'pbdb_taxon_id.1', 'pbdb_taxon_name.1', 'pbdb_taxon_rank.1',\n", + " 'Corrections to pbdb_taxon', 'family_taxon_id', 'family_taxon_name',\n", + " 'superfamily_taxon_id', 'superfamily_taxon_name', 'order_taxon_id',\n", + " 'order_taxon_name', 'class_taxon_id', 'class_taxon_name',\n", + " 'phylum_taxon_id', 'phylum_taxon_name', 'kingdom_taxon_id',\n", + " 'kingdom_taxon_name', 'unranked clade_taxon_id',\n", + " 'unranked clade_taxon_name'],\n", + " dtype='object')" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "PI_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "1cb7cb6e-a838-41b3-afdd-23ec1b008679", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['taxon_group', 'verbatim_name', 'name', 'genus modifier', 'genus name',\n", + " 'species modifier', 'species name', 'subspecies modifier',\n", + " 'subspecies name', 'non-taxa descriptor', 'pbdb_taxon_id',\n", + " 'pbdb_taxon_name', 'pbdb_taxon_rank', 'family_taxon_id',\n", + " 'family_taxon_name', 'order_taxon_id', 'order_taxon_name',\n", + " 'class_taxon_id', 'class_taxon_name', 'phylum_taxon_id',\n", + " 'phylum_taxon_name', 'kingdom_taxon_id', 'kingdom_taxon_name',\n", + " 'genus_taxon_id', 'genus_taxon_name', 'unranked clade_taxon_id',\n", + " 'unranked clade_taxon_name', 'subclass_taxon_id', 'subclass_taxon_name',\n", + " 'Any taxon above genus', 'Any taxon above genus modifier', 'Comment',\n", + " 'Corrections to pbdb_taxon', 'Notes (change to Internal only notes?)',\n", + " 'comments', 'subgenera modifier', 'subgenera name', 'species_taxon_id',\n", + " 'species_taxon_name'],\n", + " dtype='object')" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unapproved_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "c8dddbff-f7ae-48e6-9f76-d558e8091133", + "metadata": {}, + "outputs": [], + "source": [ + "old_cols = set(unapproved_df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "517972eb-5666-4fe6-b6aa-fef3d6cd9b1a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "39" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "unapproved_df = unapproved_df.reindex(columns=[\n", + " 'taxon_group', \n", + " 'verbatim_name', 'name',\n", + " 'Comment', 'Notes (change to Internal only notes?)',\n", + " 'Any taxon above genus modifier', 'Any taxon above genus', \n", + " 'genus modifier', 'genus name',\n", + " 'subgenera modifier', 'subgenera name',\n", + " 'species modifier', 'species name', \n", + " 'subspecies modifier','subspecies name', \n", + " 'non-taxa descriptor', \n", + " 'comments',\n", + " 'pbdb_taxon_id', 'pbdb_taxon_name', 'pbdb_taxon_rank', \n", + " 'Corrections to pbdb_taxon',\n", + " 'species_taxon_id', 'species_taxon_name',\n", + " 'genus_taxon_id', 'genus_taxon_name',\n", + " 'family_taxon_id', 'family_taxon_name', \n", + " 'order_taxon_id', 'order_taxon_name',\n", + " 'subclass_taxon_id', 'subclass_taxon_name',\n", + " 'class_taxon_id', 'class_taxon_name', \n", + " 'phylum_taxon_id', 'phylum_taxon_name', \n", + " 'kingdom_taxon_id', 'kingdom_taxon_name',\n", + " 'unranked clade_taxon_id', 'unranked clade_taxon_name', \n", + "])\n", + "len(unapproved_df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "3934d892-93f8-40c6-9fc2-6a6a7d58df6b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "set()" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "old_cols - set(unapproved_df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "c94d8ac1-880f-412b-96bb-90a57c66a027", "metadata": {}, "outputs": [], "source": [ - "merged_df.to_csv(taxa_pbdb_path, index=False)" + "unapproved_df.to_csv(unapproved_taxa_path, index=False)" ] }, { "cell_type": "code", "execution_count": null, - "id": "2c30699f", + "id": "85d46b3d-e2c7-4f5f-9aa7-20d2fdd41074", "metadata": {}, "outputs": [], "source": []