update project solutions for the notebook

NBISweden · Nov 13, 2024 · dd80539 · dd80539
1 parent d352f45
commit dd80539
Showing 1 changed file with 48 additions and 36 deletions.
diff --git a/assignment/Solutions_project.ipynb b/assignment/Solutions_project.ipynb
@@ -16,7 +16,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
@@ -30,7 +30,7 @@
    "source": [
     "# Open the reference file (fasta) and loop through it, removing the first line (starts with >)\n",
     "# Sum the length of the lines, removing the newline character\n",
-    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
+    "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
     "length = 0\n",
     "for line in f:\n",
     "    if not line.startswith(\">\"):\n",
@@ -48,7 +48,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [
     {
@@ -62,7 +62,7 @@
    "source": [
     "# Open the GTF file and loop throught it.\n",
     "# To find the genes, check the 3rd element of each line and add 1 if the feature is \"gene\"\n",
-    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
+    "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
     "genes = 0\n",
     "for line in g:\n",
     "    if not line.startswith(\"#\"):\n",
@@ -96,7 +96,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -112,7 +112,7 @@
     "# To find the number of transcripts, check the 3rd element for \"transcript\" while the specific gene exists\n",
     "# Can also be solved with bash using: \n",
     "# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep \"\\ttranscript\\t\" | wc -l\n",
-    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
+    "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
     "\n",
     "# Find number of transcripts\n",
     "transcripts = 0\n",
@@ -151,7 +151,7 @@
     "# Find all the instances of transcripts for specific gene\n",
     "# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)\n",
     "# If longer than previous longest, store it and continue\n",
-    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
+    "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
     "\n",
     "# Find longest transcript\n",
     "longest_transcript = (0, '') # update the list with length and transcript ID with a for loop\n",
@@ -179,7 +179,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 8,
    "metadata": {},
    "outputs": [
     {
@@ -197,7 +197,7 @@
     "# Save the whole sequence from the fasta file to a variable\n",
     "# and get the start-1 to end from this variable \n",
     "# (incl. its start and end positions)\n",
-    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
+    "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
     "\n",
     "for line in g:\n",
     "    if not line.startswith(\"#\"):\n",
@@ -213,7 +213,7 @@
     "g.close()\n",
     "\n",
     "# Extract the longest transcript sequence from the genome fasta file\n",
-    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
+    "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
     "\n",
     "# Store the DNA sequence of chromosome 7 in a variable\n",
     "seqList = []\n",
@@ -242,7 +242,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 50,
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
@@ -267,7 +267,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 10,
    "metadata": {},
    "outputs": [
     {
@@ -284,7 +284,7 @@
     "# Create a variable from the other file, like in the previous question\n",
     "# Loop through the set in the other file and pick the positions of the exons\n",
     "# Get the longest transcript and its exons\n",
-    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
+    "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
     "\n",
     "exons = []\n",
     "\n",
@@ -302,7 +302,7 @@
     "g.close()\n",
     "\n",
     "# Extract all exons of the longest transcript sequence from the genome fasta file\n",
-    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
+    "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
     "\n",
     "seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
     "for line in f:\n",
@@ -332,7 +332,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 11,
    "metadata": {},
    "outputs": [
     {
@@ -372,7 +372,7 @@
    ],
    "source": [
     "# Get the longest transcript incl. its start and stop codon positions\n",
-    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
+    "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
     "\n",
     "for line in g:\n",
     "    if not line.startswith(\"#\"):\n",
@@ -389,7 +389,7 @@
     "g.close()\n",
     "\n",
     "# Extract all start and stop codon of the longest transcript sequence from the genome fasta file\n",
-    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
+    "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
     "\n",
     "seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
     "for line in f:\n",
@@ -419,7 +419,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 77,
+   "execution_count": 13,
    "metadata": {},
    "outputs": [
     {
@@ -434,7 +434,7 @@
    ],
    "source": [
     "# Get the longest transcript and its exons, as well as the start codon\n",
-    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
+    "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
     "\n",
     "exons = []\n",
     "\n",
@@ -460,7 +460,7 @@
     "g.close()\n",
     "\n",
     "# Extract all exons of the longest transcript sequence from the genome fasta file\n",
-    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
+    "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
     "\n",
     "seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
     "for line in f:\n",
@@ -498,7 +498,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 14,
    "metadata": {},
    "outputs": [
     {
@@ -522,20 +522,25 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 81,
+   "execution_count": 15,
    "metadata": {},
    "outputs": [
     {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "The sequence of patient  3  has a different length than the reference genome sequence.\n"
+     "ename": "FileNotFoundError",
+     "evalue": "[Errno 2] No such file or directory: 'Patient1.fa'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mFileNotFoundError\u001b[0m                         Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[15], line 51\u001b[0m\n\u001b[1;32m     49\u001b[0m i \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;66;03m# index for loop through patient fasta files\u001b[39;00m\n\u001b[1;32m     50\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m i \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m6\u001b[39m:\n\u001b[0;32m---> 51\u001b[0m     p \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPatient\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(i) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.fa\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m     52\u001b[0m     patientSeqList \u001b[38;5;241m=\u001b[39m [] \u001b[38;5;66;03m# store the DNA sequence of the patient in a list\u001b[39;00m\n\u001b[1;32m     53\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m p:\n",
+      "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py:286\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m    279\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m    280\u001b[0m     \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m    281\u001b[0m         \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    282\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    283\u001b[0m         \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m    284\u001b[0m     )\n\u001b[0;32m--> 286\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m io_open(file, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
+      "\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Patient1.fa'"
      ]
     }
    ],
    "source": [
     "# Get the longest transcript and its exons, as well as the start codon\n",
-    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
+    "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
     "\n",
     "exons = []\n",
     "\n",
@@ -559,7 +564,7 @@
     "g.close()\n",
     "\n",
     "# Extract all exons of the longest transcript sequence from the genome fasta file\n",
-    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
+    "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
     "\n",
     "seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
     "for line in f:\n",
@@ -613,24 +618,24 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 95,
+   "execution_count": 4,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "The sequence of patient  3  has a different length than the reference genome sequence.\n"
+      "The sequence of patient  1  has a different length than the reference genome sequence.\n"
      ]
     }
    ],
    "source": [
     "from Bio import SeqIO\n",
     "from Bio.Seq import Seq\n",
-    "from Bio.Alphabet import IUPAC\n",
+    "\n",
     "\n",
     "# Get the longest transcript and its exons, as well as the start codon\n",
-    "g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
+    "g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
     "\n",
     "exons = []\n",
     "\n",
@@ -655,7 +660,7 @@
     "\n",
     "\n",
     "# Extract all exons of the longest transcript sequence from the genome fasta file\n",
-    "f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
+    "f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
     "\n",
     "records = list(SeqIO.parse(f, \"fasta\")) # BioPython\n",
     "if records[0].id == \"7\":  # BioPython\n",
@@ -678,7 +683,7 @@
     "\n",
     "i = 1 # index for loop through patient fasta files\n",
     "while i < 6:\n",
-    "    p = \"Patient\" + str(i) + \".fa\"\n",
+    "    p = \"data/Patient\" + str(i) + \".fa\"\n",
     "    patientRecords = list(SeqIO.parse(p, \"fasta\")) # BioPython\n",
     "    if patientRecords[0].id == \"7\":  # BioPython\n",
     "        patientSeq = patientRecords[0].seq # BioPython\n",
@@ -695,11 +700,18 @@
     "\n",
     "    i += 1"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
@@ -713,7 +725,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.8.5"
+   "version": "3.11.0"
   }
  },
  "nbformat": 4,