diff --git a/assignment/Solutions_project.ipynb b/assignment/Solutions_project.ipynb index 0034f66..59f7cf4 100644 --- a/assignment/Solutions_project.ipynb +++ b/assignment/Solutions_project.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -30,7 +30,7 @@ "source": [ "# Open the reference file (fasta) and loop through it, removing the first line (starts with >)\n", "# Sum the length of the lines, removing the newline character\n", - "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", + "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "length = 0\n", "for line in f:\n", " if not line.startswith(\">\"):\n", @@ -48,7 +48,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -62,7 +62,7 @@ "source": [ "# Open the GTF file and loop throught it.\n", "# To find the genes, check the 3rd element of each line and add 1 if the feature is \"gene\"\n", - "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", + "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "genes = 0\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", @@ -96,7 +96,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -112,7 +112,7 @@ "# To find the number of transcripts, check the 3rd element for \"transcript\" while the specific gene exists\n", "# Can also be solved with bash using: \n", "# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep \"\\ttranscript\\t\" | wc -l\n", - "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", + "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "# Find number of transcripts\n", "transcripts = 0\n", @@ -151,7 +151,7 @@ "# Find all the instances of transcripts for specific gene\n", "# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)\n", "# If longer than previous longest, store it and continue\n", - "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", + "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "# Find longest transcript\n", "longest_transcript = (0, '') # update the list with length and transcript ID with a for loop\n", @@ -179,7 +179,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -197,7 +197,7 @@ "# Save the whole sequence from the fasta file to a variable\n", "# and get the start-1 to end from this variable \n", "# (incl. its start and end positions)\n", - "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", + "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", @@ -213,7 +213,7 @@ "g.close()\n", "\n", "# Extract the longest transcript sequence from the genome fasta file\n", - "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", + "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "# Store the DNA sequence of chromosome 7 in a variable\n", "seqList = []\n", @@ -242,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -267,7 +267,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -284,7 +284,7 @@ "# Create a variable from the other file, like in the previous question\n", "# Loop through the set in the other file and pick the positions of the exons\n", "# Get the longest transcript and its exons\n", - "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", + "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "exons = []\n", "\n", @@ -302,7 +302,7 @@ "g.close()\n", "\n", "# Extract all exons of the longest transcript sequence from the genome fasta file\n", - "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", + "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "seqList = [] # store the DNA sequence of chromosome 7 in a list\n", "for line in f:\n", @@ -332,7 +332,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -372,7 +372,7 @@ ], "source": [ "# Get the longest transcript incl. its start and stop codon positions\n", - "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", + "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "for line in g:\n", " if not line.startswith(\"#\"):\n", @@ -389,7 +389,7 @@ "g.close()\n", "\n", "# Extract all start and stop codon of the longest transcript sequence from the genome fasta file\n", - "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", + "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "seqList = [] # store the DNA sequence of chromosome 7 in a list\n", "for line in f:\n", @@ -419,7 +419,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -434,7 +434,7 @@ ], "source": [ "# Get the longest transcript and its exons, as well as the start codon\n", - "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", + "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "exons = []\n", "\n", @@ -460,7 +460,7 @@ "g.close()\n", "\n", "# Extract all exons of the longest transcript sequence from the genome fasta file\n", - "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", + "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "seqList = [] # store the DNA sequence of chromosome 7 in a list\n", "for line in f:\n", @@ -498,7 +498,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -522,20 +522,25 @@ }, { "cell_type": "code", - "execution_count": 81, + "execution_count": 15, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "The sequence of patient 3 has a different length than the reference genome sequence.\n" + "ename": "FileNotFoundError", + "evalue": "[Errno 2] No such file or directory: 'Patient1.fa'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[15], line 51\u001b[0m\n\u001b[1;32m 49\u001b[0m i \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;66;03m# index for loop through patient fasta files\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m i \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m6\u001b[39m:\n\u001b[0;32m---> 51\u001b[0m p \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPatient\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(i) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.fa\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 52\u001b[0m patientSeqList \u001b[38;5;241m=\u001b[39m [] \u001b[38;5;66;03m# store the DNA sequence of the patient in a list\u001b[39;00m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m p:\n", + "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py:286\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 281\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 282\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 284\u001b[0m )\n\u001b[0;32m--> 286\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m io_open(file, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n", + "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Patient1.fa'" ] } ], "source": [ "# Get the longest transcript and its exons, as well as the start codon\n", - "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", + "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "exons = []\n", "\n", @@ -559,7 +564,7 @@ "g.close()\n", "\n", "# Extract all exons of the longest transcript sequence from the genome fasta file\n", - "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", + "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "seqList = [] # store the DNA sequence of chromosome 7 in a list\n", "for line in f:\n", @@ -613,24 +618,24 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The sequence of patient 3 has a different length than the reference genome sequence.\n" + "The sequence of patient 1 has a different length than the reference genome sequence.\n" ] } ], "source": [ "from Bio import SeqIO\n", "from Bio.Seq import Seq\n", - "from Bio.Alphabet import IUPAC\n", + "\n", "\n", "# Get the longest transcript and its exons, as well as the start codon\n", - "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n", + "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n", "\n", "exons = []\n", "\n", @@ -655,7 +660,7 @@ "\n", "\n", "# Extract all exons of the longest transcript sequence from the genome fasta file\n", - "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", + "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n", "\n", "records = list(SeqIO.parse(f, \"fasta\")) # BioPython\n", "if records[0].id == \"7\": # BioPython\n", @@ -678,7 +683,7 @@ "\n", "i = 1 # index for loop through patient fasta files\n", "while i < 6:\n", - " p = \"Patient\" + str(i) + \".fa\"\n", + " p = \"data/Patient\" + str(i) + \".fa\"\n", " patientRecords = list(SeqIO.parse(p, \"fasta\")) # BioPython\n", " if patientRecords[0].id == \"7\": # BioPython\n", " patientSeq = patientRecords[0].seq # BioPython\n", @@ -695,11 +700,18 @@ "\n", " i += 1" ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -713,7 +725,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.11.0" } }, "nbformat": 4,