Skip to content

Commit

Permalink
update project solutions for the notebook
Browse files Browse the repository at this point in the history
  • Loading branch information
nanjiangshu committed Nov 13, 2024
1 parent d352f45 commit dd80539
Showing 1 changed file with 48 additions and 36 deletions.
84 changes: 48 additions & 36 deletions assignment/Solutions_project.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 5,
"metadata": {},
"outputs": [
{
Expand All @@ -30,7 +30,7 @@
"source": [
"# Open the reference file (fasta) and loop through it, removing the first line (starts with >)\n",
"# Sum the length of the lines, removing the newline character\n",
"f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"length = 0\n",
"for line in f:\n",
" if not line.startswith(\">\"):\n",
Expand All @@ -48,7 +48,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 6,
"metadata": {},
"outputs": [
{
Expand All @@ -62,7 +62,7 @@
"source": [
"# Open the GTF file and loop throught it.\n",
"# To find the genes, check the 3rd element of each line and add 1 if the feature is \"gene\"\n",
"g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"genes = 0\n",
"for line in g:\n",
" if not line.startswith(\"#\"):\n",
Expand Down Expand Up @@ -96,7 +96,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 7,
"metadata": {},
"outputs": [
{
Expand All @@ -112,7 +112,7 @@
"# To find the number of transcripts, check the 3rd element for \"transcript\" while the specific gene exists\n",
"# Can also be solved with bash using: \n",
"# cat Homo_sapiens.GRCh38.93.gtf | grep ENSG00000001626 | grep \"\\ttranscript\\t\" | wc -l\n",
"g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"\n",
"# Find number of transcripts\n",
"transcripts = 0\n",
Expand Down Expand Up @@ -151,7 +151,7 @@
"# Find all the instances of transcripts for specific gene\n",
"# For each of them, extract the transcript id (column 8) and the length (column 4 - column 3 + 1)\n",
"# If longer than previous longest, store it and continue\n",
"g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"\n",
"# Find longest transcript\n",
"longest_transcript = (0, '') # update the list with length and transcript ID with a for loop\n",
Expand Down Expand Up @@ -179,7 +179,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 8,
"metadata": {},
"outputs": [
{
Expand All @@ -197,7 +197,7 @@
"# Save the whole sequence from the fasta file to a variable\n",
"# and get the start-1 to end from this variable \n",
"# (incl. its start and end positions)\n",
"g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"\n",
"for line in g:\n",
" if not line.startswith(\"#\"):\n",
Expand All @@ -213,7 +213,7 @@
"g.close()\n",
"\n",
"# Extract the longest transcript sequence from the genome fasta file\n",
"f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"\n",
"# Store the DNA sequence of chromosome 7 in a variable\n",
"seqList = []\n",
Expand Down Expand Up @@ -242,7 +242,7 @@
},
{
"cell_type": "code",
"execution_count": 50,
"execution_count": 9,
"metadata": {},
"outputs": [
{
Expand All @@ -267,7 +267,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 10,
"metadata": {},
"outputs": [
{
Expand All @@ -284,7 +284,7 @@
"# Create a variable from the other file, like in the previous question\n",
"# Loop through the set in the other file and pick the positions of the exons\n",
"# Get the longest transcript and its exons\n",
"g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"\n",
"exons = []\n",
"\n",
Expand All @@ -302,7 +302,7 @@
"g.close()\n",
"\n",
"# Extract all exons of the longest transcript sequence from the genome fasta file\n",
"f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"\n",
"seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
"for line in f:\n",
Expand Down Expand Up @@ -332,7 +332,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 11,
"metadata": {},
"outputs": [
{
Expand Down Expand Up @@ -372,7 +372,7 @@
],
"source": [
"# Get the longest transcript incl. its start and stop codon positions\n",
"g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"\n",
"for line in g:\n",
" if not line.startswith(\"#\"):\n",
Expand All @@ -389,7 +389,7 @@
"g.close()\n",
"\n",
"# Extract all start and stop codon of the longest transcript sequence from the genome fasta file\n",
"f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"\n",
"seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
"for line in f:\n",
Expand Down Expand Up @@ -419,7 +419,7 @@
},
{
"cell_type": "code",
"execution_count": 77,
"execution_count": 13,
"metadata": {},
"outputs": [
{
Expand All @@ -434,7 +434,7 @@
],
"source": [
"# Get the longest transcript and its exons, as well as the start codon\n",
"g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"\n",
"exons = []\n",
"\n",
Expand All @@ -460,7 +460,7 @@
"g.close()\n",
"\n",
"# Extract all exons of the longest transcript sequence from the genome fasta file\n",
"f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"\n",
"seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
"for line in f:\n",
Expand Down Expand Up @@ -498,7 +498,7 @@
},
{
"cell_type": "code",
"execution_count": 78,
"execution_count": 14,
"metadata": {},
"outputs": [
{
Expand All @@ -522,20 +522,25 @@
},
{
"cell_type": "code",
"execution_count": 81,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The sequence of patient 3 has a different length than the reference genome sequence.\n"
"ename": "FileNotFoundError",
"evalue": "[Errno 2] No such file or directory: 'Patient1.fa'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mFileNotFoundError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[15], line 51\u001b[0m\n\u001b[1;32m 49\u001b[0m i \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m1\u001b[39m \u001b[38;5;66;03m# index for loop through patient fasta files\u001b[39;00m\n\u001b[1;32m 50\u001b[0m \u001b[38;5;28;01mwhile\u001b[39;00m i \u001b[38;5;241m<\u001b[39m \u001b[38;5;241m6\u001b[39m:\n\u001b[0;32m---> 51\u001b[0m p \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mopen\u001b[39m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mPatient\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;241m+\u001b[39m \u001b[38;5;28mstr\u001b[39m(i) \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m.fa\u001b[39m\u001b[38;5;124m\"\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124m'\u001b[39m)\n\u001b[1;32m 52\u001b[0m patientSeqList \u001b[38;5;241m=\u001b[39m [] \u001b[38;5;66;03m# store the DNA sequence of the patient in a list\u001b[39;00m\n\u001b[1;32m 53\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m p:\n",
"File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/IPython/core/interactiveshell.py:286\u001b[0m, in \u001b[0;36m_modified_open\u001b[0;34m(file, *args, **kwargs)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m file \u001b[38;5;129;01min\u001b[39;00m {\u001b[38;5;241m0\u001b[39m, \u001b[38;5;241m1\u001b[39m, \u001b[38;5;241m2\u001b[39m}:\n\u001b[1;32m 280\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\n\u001b[1;32m 281\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mIPython won\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mt let you open fd=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfile\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m by default \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 282\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mas it is likely to crash IPython. If you know what you are doing, \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 283\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124myou can use builtins\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124m open.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 284\u001b[0m )\n\u001b[0;32m--> 286\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m io_open(file, \u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n",
"\u001b[0;31mFileNotFoundError\u001b[0m: [Errno 2] No such file or directory: 'Patient1.fa'"
]
}
],
"source": [
"# Get the longest transcript and its exons, as well as the start codon\n",
"g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"\n",
"exons = []\n",
"\n",
Expand All @@ -559,7 +564,7 @@
"g.close()\n",
"\n",
"# Extract all exons of the longest transcript sequence from the genome fasta file\n",
"f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"\n",
"seqList = [] # store the DNA sequence of chromosome 7 in a list\n",
"for line in f:\n",
Expand Down Expand Up @@ -613,24 +618,24 @@
},
{
"cell_type": "code",
"execution_count": 95,
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The sequence of patient 3 has a different length than the reference genome sequence.\n"
"The sequence of patient 1 has a different length than the reference genome sequence.\n"
]
}
],
"source": [
"from Bio import SeqIO\n",
"from Bio.Seq import Seq\n",
"from Bio.Alphabet import IUPAC\n",
"\n",
"\n",
"# Get the longest transcript and its exons, as well as the start codon\n",
"g = open(\"Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"g = open(\"data/Homo_sapiens.GRCh38.93.gtf\", 'r')\n",
"\n",
"exons = []\n",
"\n",
Expand All @@ -655,7 +660,7 @@
"\n",
"\n",
"# Extract all exons of the longest transcript sequence from the genome fasta file\n",
"f = open(\"Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"f = open(\"data/Homo_sapiens.GRCh38.dna_sm.chromosome.7.fa\", 'r')\n",
"\n",
"records = list(SeqIO.parse(f, \"fasta\")) # BioPython\n",
"if records[0].id == \"7\": # BioPython\n",
Expand All @@ -678,7 +683,7 @@
"\n",
"i = 1 # index for loop through patient fasta files\n",
"while i < 6:\n",
" p = \"Patient\" + str(i) + \".fa\"\n",
" p = \"data/Patient\" + str(i) + \".fa\"\n",
" patientRecords = list(SeqIO.parse(p, \"fasta\")) # BioPython\n",
" if patientRecords[0].id == \"7\": # BioPython\n",
" patientSeq = patientRecords[0].seq # BioPython\n",
Expand All @@ -695,11 +700,18 @@
"\n",
" i += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -713,7 +725,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.5"
"version": "3.11.0"
}
},
"nbformat": 4,
Expand Down

0 comments on commit dd80539

Please sign in to comment.