diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index e885756..5a9ad35 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -6,6 +6,12 @@ dcicwrangling
 Change Log
 ----------
 
+2.4.0
+=====
+
+`PR:107 add useful notebook #15 to add opf collections to esets _`
+
+* added a new useful notebook that uses a lab-submitted processed file sheet to link replicate sets to other processed file collections for those sets
 2.3.0
 =====
 
diff --git a/notebooks/useful_notebooks/15_add_opf_collections_to_sets.ipynb b/notebooks/useful_notebooks/15_add_opf_collections_to_sets.ipynb
new file mode 100644
index 0000000..1c452c5
--- /dev/null
+++ b/notebooks/useful_notebooks/15_add_opf_collections_to_sets.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Uses information from a submitter PF spreadsheet to add files to the appropriate place in the indicated linked items\n",
+    "### Currently for ExperimentSet opfs but could be extended to Experiments, Publications and possibly Pages\n",
+    "\n",
+    "#### Setup\n",
+    "\n",
+    "- Provide a title and description to use for the opf section.\n",
+    "- Indicate the path to the workbook."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from dcicutils import ff_utils\n",
+    "from functions.notebook_functions import *\n",
+    "from functions.wfr import *\n",
+    "\n",
+    "# get key from keypairs.json\n",
+    "my_key = get_key('andyprod')\n",
+    "\n",
+    "# set the type, title and description for the new opf group here\n",
+    "opf_type = 'supplementary'\n",
+    "opf_title = \"Analysis results provided by the data submitters - the Diao Lab\"\n",
+    "opf_desc = \"The results were generated by the Diao lab using the code available at https://github.com/jianhong/hicar/releases/tag/2.0.0rc\"\n",
+    "\n",
+    "# location of the excel processed file sheet\n",
+    "xcel_file = '/Users/andrew/Documents/work/4DN_Metadata/Diao_Yarui_lab/HiCAR_RNAseq_myoblast_diff/230830_hicar_processed_results_ajs_upd.xlsx'\n",
+    "xcel, sheets = digest_xlsx(xcel_file)\n",
+    "xcel_data = reader(xcel)\n",
+    "\n",
+    "fieldnames = next(xcel_data)\n",
+    "\n",
+    "# create a dict mapping each dataset to its processed files\n",
+    "dset2opfs = {}\n",
+    "for row in xcel_data:\n",
+    "    if row[0].startswith('#'):\n",
+    "        continue\n",
+    "    data = dict(zip(fieldnames, row))\n",
+    "    data = {k: v for k, v in data.items() if v}\n",
+    "    file_alias = data.get('aliases')\n",
+    "    dataset = data.get('# linked_datasets')\n",
+    "    dset2opfs.setdefault(dataset, []).append(file_alias)\n",
+    "\n",
+    "print(dset2opfs)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# go through the dictionary and create a patch for each dataset\n",
+    "# need to check for and maintain any existing opf groups\n",
+    "\n",
+    "# if True do the action, if False just report\n",
+    "action = True\n",
+    "\n",
+    "for ds, opfs in dset2opfs.items():\n",
+    "    opf_obj = {'type': opf_type, 'title': opf_title, 'files': opfs}\n",
+    "    if opf_desc:\n",
+    "        opf_obj['description'] = opf_desc\n",
+    "\n",
+    "    dset = ff_utils.get_metadata(ds, my_key, add_on='frame=raw')\n",
+    "    curr_opfs = dset.get('other_processed_files', [])\n",
+    "    if curr_opfs:\n",
+    "        curr_titles = [i.get('title') for i in curr_opfs]\n",
+    "        if opf_title in curr_titles:\n",
+    "            print('ERROR: {} has already been used as a title for {} - NO GO!'.format(opf_title, ds))\n",
+    "            continue\n",
+    "    # in this specific case we want to insert the new group into\n",
+    "    # the first position of the list\n",
+    "    new_opf_grps = [opf_obj] + curr_opfs\n",
+    "\n",
+    "    patch_data = {'other_processed_files': new_opf_grps}\n",
+    "    if action:\n",
+    "        res = ff_utils.patch_metadata(patch_data, dset['uuid'], key=my_key)\n",
+    "        print(res)\n",
+    "    else:\n",
+    "        print(\"\\n\")\n",
+    "        print(\"{}\\t{}\".format(dset.get('uuid'), dset.get('accession')))\n",
+    "        print(opf_obj)\n",
+    "        print(\"\\n\\n\")\n",
+    "        print(patch_data)\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "anaconda-cloud": {},
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/pyproject.toml b/pyproject.toml
index 8871d79..53c1934 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "dcicwrangling"
-version = "2.3.0"
+version = "2.4.0"
 description = "Scripts and Jupyter notebooks for 4DN wrangling"
 authors = ["4DN-DCIC Team "]
 license = "MIT"