MSA_Reader.py

#The path to the biopython packages was not automatically added to anaconda. To allow import of the package, a manual path must be specified.
import os
#Absolute location: 

#os.environ['CONDA_PREFIX']=/home/wish1832/anaconda3
#Code to append biopython location to path
os.environ['PATH'] = os.environ['PATH']+';'+os.environ['CONDA_PREFIX']+"/pkgs/biopython-1.78-py38h7b6447c_0/lib/python3.8/site-packages"

from Bio import AlignIO as msa_in
from Bio.Align import MultipleSeqAlignment as msa
import Bio.SeqRecord
import numpy as np
import pandas as pd

def initalize_file(outname):
    """
    outname = the directory and prefix of the output file (string)
        For example, setting outname "mydir/mutant" will create an outfile in the directory mydir with the name mutant_variants_raw.tsv.
        Other reports generated by the output will also exist in the directory mydir and have the prefix "mutant".
    
    Given an output file prefix, initialize_files() will create one file (outname_variants_raw.tsv) 
    containing raw data on the substitutions, insertions, and deletions relative to the reference genome found in the sequence alignment. 
    
    The function will print headers in each file in the order specified below:
    
    Field 1: Cluster_ID (e.g. clust48)  
    Field 2: Number of sequences in cluster 
    Field 3: Type of variant (insertion, deletion, substitution)
    Field 4: Mutation code (formatted based on the nomenclature indicated by the Human Genome Variation Society https://varnomen.hgvs.org/recommendations/protein/)
    Field 5: Amino Acid(s) of Reference Strand 
        For insertions, this is the two amino acid residues the insertion ocurrs between.
        For deletions, this will be the reference residue(s) deleted 
    Field 6: Variant Amino Acid Residue
        For multi-codon insertions, this field will contain a string of multiple codons
        For deletions, this will be blank
    Field 7: Start position of variant on multiple sequence alignment
        For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
    Field 8: End position of variant on multiple sequence alignment 
        For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
        For substitutions and single-base deletions this will be blank
    Field 9: Start position of variant on reference strand 
        For insertions this will be the codon number on the reference strand before the inserted codon(s)
    Field 10: End position of variant on reference strand
        For insertions this will be the codon number on the reference strand before the inserted codon(s)
        For substitutions and single-base deletions this will be blank
    """
    #Create filename for raw output
    raw_out = outname+"_variants_raw.tsv"
    #Define header according to elements listed in docstring
    header = ["Cluster_ID","Cluster_Size","Type","Code","Ref Residue(s)","Var Residue(s)","AA_Start(MSA)","AA_End(MSA)","AA_Start(Ref)","AA_End(Ref)"]

    #Print header in output file
    with open(raw_out,"w") as outfile:
        outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(header[0],header[1],header[2],header[3],header[4],header[5],header[6],header[7],header[8],header[9]))

    """Code below will prevent overwriting in final version.
        if not file:
            with open(file,"w") as file
        else
        
            print("Error: {} already exists. Overwriting prohibited.".format(file))"""
    #Return the filename for use in other functions
    return raw_out

def write_line(outfile,clustname,clustsize,var_type,code,ref_AA,var_AA,msa_start,ref_start,msa_end="-",ref_end="-"):
    """
    Will write out to the variant .tsv values given the location of an output list with 6 Fields in the following format:
    
    Field 1: Cluster_ID (e.g. clust48)  
    Field 2: Number of sequences in cluster 
    Field 3: Type of variant (insertion, deletion, substitution)
    Field 4: Mutation code (formatted based on the nomenclature indicated by the Human Genome Variation Society https://varnomen.hgvs.org/recommendations/protein/)
    Field 5: Amino Acid(s) of Reference Strand 
        For insertions, this is the two amino acid residues the insertion ocurrs between.
        For deletions, this will be the reference residue(s) deleted 
    Field 6: Variant Amino Acid Residue
        For multi-codon insertions, this field will contain multiple columns
        For deletions, this will be blank
    Field 7: Start position of variant on multiple sequence alignment
        For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
    Field 8: End position of variant on multiple sequence alignment 
        For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
        For substitutions and single-base deletions this will be blank
    Field 9: Start position of variant on reference strand 
        For insertions this will be the codon number on the reference strand before the inserted codon(s)
    Field 10: End position of variant on reference strand
        For insertions this will be the codon number on the reference strand before the inserted codon(s)
        For substitutions and single-base deletions this will be blank
    """
    #Write to output file
    if outfile:
        with open(outfile,"a") as file:
            file.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(clustname,clustsize,var_type,code,ref_AA,var_AA,msa_start,msa_end,ref_start,ref_end))
    
    #Raise a FileNotFoundError if the output file has not been initialized
    else:
        raise FileNotFoundError("{} does not exist.".format(outfile))        

def reference_position(reference):
    """
    For Variant Code Creation, the positions must correspond to the reference sequence, rather than the alignent.
    
    Given the reference genome, reference_position() will return a list of integers that stores the reference genome position corresponding to the msa position.
    If there is no base in the reference genome at a given position in the msa (when a "-" is in the reference genome), a blank is stored in this list.
    """
    ref_list=[]
    pos=1
    for i in reference:
        if i != "-":
            ref_list.append(pos)
            pos += 1
        else:
            ref_list.append("-")
    return ref_list        
        
def msa_iterator(alignment,raw_out,reference,ref_index,outfile):
    """
    alignment=a biopython multiple sequence alignment object
    raw_out=filename for outputting raw data
    reference=Reference strand to which sequences are compared (biopython RefSeq object)
    ref_list=A list relating an amino acid position on a multiple sequence alingment to its corresponding position on the reference strand
    i_begin=Amino acid position on multiple sequence alignment at which to begin iteration (allows for skipping over multi-base insertions and deletions)
    j_begin=Alignment object at which to begin
    
    
    msa_iterator() will iterate through the multiple sequence alignment object and compare bases to the reference genome. 
    Insertions, deletions, and substitutions will be detected and handled by separate functions based on type. 
    
    i is the amino acid position, j is the sequence position in the MSA (origional iterated through AA position first)
    """
    #Every sequence in the alignment will be iteratively compared to the reference strand
    #Multi-base deletions and insertions require a loop in which iterables can be edited
    #A while loop will be used instead of a for loop to iterate through all i codons in j genomes.
    
    #Begin with i and j equal to zero
    i,j=0,0
    
    #While loop will run until the last i position of the last j strain is tested.
    while not (j==len(alignment)-1 and i==len(reference)):
        #Status report: when at the first index of a genome, print progress (current genome/#of genomes in msa)
        if i==0:
            print("\rExamining genome {0} of {1}. Percent complete {2:.0%}".format(j+1,len(alignment),j/(len(alignment)-1)),end="")
        #When all i codons in a genome have been iterated through, the genome index is advanced by one and the i is reset to zero.
        if i==len(reference):
            j+=1
            i=0
        else:
            #For every codon, test the below conditionals
            #Conditional to detect a substitution
            if reference[i] != alignment[j,i] and reference[i]!="-" and alignment[j,i]!="-":
                substitution_case(i,j,alignment,reference,ref_index,outfile)
                #Move on to next codon
                i+=1

            #Conditional to detect a deletion
            elif (reference[i] != alignment[j,i]) and alignment[j,i]=="-":
                #Call the deletion_case function, which will check for a multi-base deletion and return the value i to resume iteration from 
                i=deletion_case(j,i,alignment,reference,ref_index,outfile)

            #Conditional to detect an insertion
            elif reference[i] != alignment[j,i] and reference[i]=="-":
                i=insertion_case(j,i,alignment,reference,ref_index,outfile)
                #insertion_case will set i to the next codon automatically
            
            #If there is no variant, increase i by one to test the next codon.
            elif reference[i]==alignment[j,i]:
                i+=1
            
            #Test for unanticipated scenarios
            else:
                print("A codon comparison has been found that does not meet any conditionals in the msa_iterator() loop.")
        
def substitution_case(i,j,alignment,reference,ref_index,outfile):
    """
    Will activate when a substution is detected during iteration through the multiple sequence alignment. 
    """
    #Store position of variant in MSA (codon position is one more than the index)
    msa_pos=i+1
    #Store position of variant in reference genome
    ref_pos=ref_index[i]
    #Store codon in position i of reference genome
    ref_AA=reference[i]
    #Store the variant codon
    var_AA=alignment[j,i] #Retrieves the codon from the sequence at position j, at position i
    #Store the name of the cluster associated with the variant sequence (sequence index j)
    #The cluster name is in the id of the RefSeq object, before the ";".
    clustname=alignment[j].id.split(sep=";")[0]
    #Retrieve size of cluster associated with the variant sequence j
    #The cluster size is printed betweeen ";" and "=" in id
    clustsize=alignment[j].id.split(sep="=")[1].split(sep=";")[0]
    #Store the substitution code
    #Based on the sequence variant nomenclature suggested by the Human Genome Variation Society,
    #The code format is <Reference_Amino_Acid><Codon_Position(Reference Genome)><Variant_Amino_Acid>
    #Source (https://varnomen.hgvs.org/recommendations/protein/variant/substitution/)
    code="{}{}{}".format(ref_AA,ref_pos,var_AA)
    
    #Write above information to file
    write_line(outfile,clustname,clustsize,"sub",code,ref_AA,var_AA,msa_pos,ref_pos,"-","-")
    
    #Once information is stored, return to iteration at the next base (returns to the For loop in the parent function)

def deletion_case(j,i,alignment,reference,ref_index,outfile):
    """
    Will activate when a deletion is detected during iteration through the multiple sequence alignment. 
    
    Since deletions may span two or more bases and nomenclature counts multi-base deletions as a single event, 
    iteration through the multiple sequence alignment in the parent msa_iterator() function must be skipped ahead
    to the end of the detion region to avoid a multiple-codon deletion being read as multiple instances of a single-base deletion.
    After recording a deletion, msa_iterator() is restarted at the position i corresponding to the codon after the end of the deletion. 
    """
    #Raise an error if the function is triggered by a codon that is not deleted
    if not (alignment[j,i]=="-" and reference[i]!="-"):
        raise ValueError("Deletion case called on a codon region without a deletion")
    #Or if the deletion case is not called at the beginning of a deletion region
    #Code detects a deletion in the codon one index before the current i it is called on
    elif i!=0 and ((reference[i-1]!=alignment[j,i-1]) and alignment[j,i-1]=="-"): 
        raise ValueError("Deletion case called in the middle of a multi-base deletion")

    #Record the position at the beginning of a deletion
    i_start=i   
   
    #Step 1: Iterate forward until extant codons are found on both the reference strand and genome j.
    while True:
        #If a deletion case is detected, or if both the reference genome and genome j have a blank at index i, try the next codon.
        if ((alignment[j,i]=="-" and reference[i]!="-") or (alignment[j,i]=="-" and reference[i]=="-")):
            #If the base currently being tested at the end of the genome, a special case is triggered 
            if i==len(reference)-1:
                #Set i_Cflank as being non-existent (affects which index is returned to msa_iterator() at end of function)
                i_Cflank=None
                break #Break the while loop and continue to step 2.
            else:
                i+=1 #Increase index by one and continue to test the next codon 
        
        #When the C-flanking extant codon is found, record the index of the flank.
        elif (alignment[j,i]!="-" and reference[i]!="-"):
            #The index of the C-flanking codon will be returned to msa_iterator() as the position to resume variant checking.
            i_Cflank=i
            #Break the while loop and continue to step 2.
            break 
            
        #Special case: it is possible for an insertion to ocurr with a deletion (delins event).
        elif alignment[j,i]!="-" and reference[i]=="-":
            #Run the delins case and store the index of the C-flanking codon to the event
            i=delins_case(j,i_start,alignment,reference,ref_index,outfile)
            #msa_iterator() will continue testing for variants at the index i
            return i

    #Step 2: Iterate backwards over any blanks that may exist until the end of the deletion region is reached.
    #Iteration will begin at the C-flanking codon (or the end of the genome) and will include the start of the deletion region.
    for i in range((i_Cflank if i_Cflank else len(reference)-1),i_start-1,-1):
        #If a blank is discovered at the position, continue to previous codon.
        if (reference[i]=="-" and alignment[j,i]=="-"):
            continue
        
        #If a deletion is found, stop iteration and record current position as i_end (end of deletion region).
        elif (reference[i]!="-" and alignment[j,i]=="-"):
            i_end=i
            #Break the for loop and continue to step 3.
            break
    
    #Step 3: Record deletion information using i_start and i_end.  
    record_deletion_info(j,i_start,i_end,alignment,reference,ref_index,outfile)
    
    #Return the index of the C-flanking codon to msa_iterator() if it exists. 
    #For a deletion at the end of the genome, return i=len(reference), which will cause msa_iterator() to skip to the next line.
    return (i_Cflank if i_Cflank else len(reference))
        
def insertion_case(j,i,alignment,reference,ref_index,outfile):
    """
    insertion_case() will gather the following information regarding the insertion in 5 steps:
    
    instype=Either "ins", "N_ext", or "C_ext". instype will tell the function record_insertion_info() which
        type of insertion case to record. Insertions at the 5' end of the transcriptome are N-terminal extentions,
        and must be recorded differently from standard insertions. The same is true of C-terminal extentions, which
        are insertions at the 3' end of the transcriptome.
        
        If N-terminal or C-terminal extentions have not been discovered, instype will be set to "ins".
        
    i_start=The index of the start of the insertion region (first location where reference is blank and the variant genome has a codon)
    
    i_Nflank= The index of the flanking N-terminal codon to the insertion in the reference strand (required for the code).
        For N-terminal extentions, this will be equal to "None".
    
    i_end=The index of the end of the insertion region (last location where the reference is blank and the variant genome has a codon)
    
    i_Cflank=The index of the flanking C-terminal codon to the insertion in the reference strand (required for the insertion code).
        For C-terminal extentions, this will be equal to "None".

    """
    instype=None
    
    #Raise an error if the function is triggered by a codon that is not inserted
    if not (alignment[j,i]!="-" and reference[i]=="-"):
        raise ValueError("Deletion case called on a codon region without a insertion")
    #Or if the insertion case is not called at the beginning of a deletion region
    #Code detects a insertion in the codon one index before the current i it is called on
    elif i!=0 and ((reference[i-1]!=alignment[j,i-1]) and reference[i-1]=="-"): 
        raise ValueError("Deletion case called in the middle of a multi-base insertion")
    
    #Step 1: Record the index at beginning of insertion
    i_start=i
    
    #Begin iteration
    #Step 2: It is not necessarily true that the flanking reference codon is one index before the insertion start.
    #A for loop will iterate backwards until an extant reference codon is found. 
    #If the insertion region is discovered in the first codon position, a special case is triggered.
    if i!=0:
        for idx in range(i-1,-1,-1): #Iterates backwards from the start of the insertion region, ending at 0
            #idx used instead of i to prevent overwriting of i to be passed to be msa_iterator
            if reference[idx]=="-":
                #Special case, if an N-terminal flanking codon has not been discovered when the beginning of the genome
                #is reached, this is an N-terminal extention.
                if idx==0:
                    i_Nflank=None
                    break #Break the loop and continue to step 3.
            
            #If the N-terminal flanking codon is found, store the index of the codon.
            elif reference[idx]!="-":
                i_Nflank=idx
                break #Break the loop and continue to step 3.
    
    #Step 2: Special case of insertion at beginning of region 
    #Called when an insertion is found starting at i=0 (impossible for a N-terminal flank to exist)
    else:
        #The index of the N-terminal flank is set to "none"
        i_Nflank=None
        
    #Step 3: Iterate forward until the first position of an extant codon on both the reference and the variant strand 
    #Will yield i_Cflank 
    while True: 
        #Check for an insertion, or a blank on both the reference and the genome j, at the codon position. If so, continue on
        if (reference[i]=="-" and alignment[j,i]!="-") or (reference[i]=="-" and alignment[j,i]=="-"):
            #Special case: end of the genome is reached and contains a blank (will also be triggered if the 
            #insertion begins at the end of the variant genome). This is a C-terminal extension.
            if i==len(reference)-1:
                i_Cflank=None
                break #Terminate the loop and continue to step 4  
            
            #If not at the end of the genome, increase the index by one and test the next codon.
            else:
                i+=1

        #Check for a deletion at the codon. If so, switch to the indel case
        elif reference[i]!="-" and alignment[j,i]=="-":
            #i_start will be passed to the indel function
            i=delins_case(j,i_start,alignment,reference,ref_index,outfile)
            return i #After delins case is ran, return the position of the C-terminal flank if it exists to the msa_iterator() function

        #Check for extant codons in both reference and genome j at the position
        elif reference[i]!="-" and alignment[j,i]!="-":
            #The position of the C-terminal flanking codon will be stored.
            i_Cflank=i
            break #Terminate the loop and continue to step 4.
    
    #Step 4: Iterate backward over any blank codons on both the reference and variant strands until a codon on the variant strand is reached
    #Iteration will begin one codon before the C-terminal flank if it exists, otherwise iteration begins at the end of the genome.
    for idx in range((i_Cflank-1 if i_Cflank else len(reference)-1),(i_Nflank if i_Nflank else -1),-1): #Iterates backwards to i_Nflank (excluding i_Nflank) if it exists, otherwise iterates to zero(inclusive).
        #if a blank is found, continue backwards iteration 
        if alignment[j,idx]=="-":
            continue
        
        #The first codon found in the alignment will mark the end of the inserted sequence in the multiple sequence alignment.
        elif alignment[j,idx]!="-":
            i_end=idx
            break #Break the loop and continue to step 5.
    
    #Step 5: determine type of insertion based on the presence/absence of N-terminal and C-terminal flanking codons
    if i_Cflank and i_Nflank:
        instype="ins"
    elif i_Cflank and not i_Nflank:
        instype="N_ext"
    elif i_Nflank and not i_Cflank:
        instype="C_ext"
    
    #All indeces have been gathered. The indeces will be passed to the record_insertion_info() function to record the insertion in the file
    record_insertion_info(j,instype,i_start,i_end,alignment,reference,ref_index,outfile,i_Nflank,i_Cflank)
    #Recording complete. Pass index of the C-flanking codon back to msa_iterator() if it exists 
    return (i_Cflank if i_Cflank else len(reference))

def delins_case(j,i_start,alignment,reference,ref_index,outfile):
    """
    insertion_case() will gather the following information regarding the insertion-deletion (indel) event in 5 steps:
    
    i_start: The beginning of the insertion region. This is passed to the function by the function that triggered the delins case.
    
    i_end: The index of the end of the indel region (last location where the reference is blank and the variant genome has a codon)
    
    i_first: The index of the first codon deleted (used for computation of the variant code)
    
    i_last: The index of the last codon deleted (used for computation of the variant code)
    
    codons_inserted: The amino acid codons that have been inserted into the deletion region, with any blanks between residues removed.  
    """
    #Step 1: Iterate until the C-terminal codon of the delins region is found
    #This will be when there is an extant codon on both the reference strain and on genome j.
    i=i_start
    while True:
        #Test the current codon.
        #If the current codon is an insertion, deletion, or a double blank, continue iteration.
        if (reference[i]!="-" and alignment[j,i]=="-") or (reference[i]=="-" and alignment[j,i]!="-") or (reference[i]=="-" and alignment[j,i]=="-"):
            #Special case: the end of the genome is reached before a C-terminal flank is found
            if i==len(reference)-1:
                i_Cflank=None
                #Break the while loop and continue to step 2
                break
            else:
                i+=1
                
        #If there are codons on the reference and alignment genomes at the current position, the end of the indel region has been reached.
        elif (reference[i]!="-" and alignment[j,i]!="-"):
            #Store this position as the position of the C-terminal flanking codon to the indel region (iteration in msa_iterator() must resume from this point.) 
            i_Cflank=i
            break #Break the while loop and continue to step 2.
            
    #Step 2: Iterate backward over any blanks that may exist until the last codon that is either inserted or deleted
    #Record this position as i_end
    for i in range((i_Cflank if i_Cflank else len(reference)-1),i_start-1,-1):
        #If a blank is discovered at the position, continue to previous codon.
        if (reference[i]=="-" and alignment[j,i]=="-"):
            continue
        
        #If a deletion or an insertion is found, stop iteration and record current position as i_end (end of indel region).
        elif (reference[i]!="-" and alignment[j,i]=="-") or (reference[i]=="-" and alignment[j,i]!="-"):
            i_end=i
            break
    
    #Step 3: Iterate through the indel region on the reference strand, storing the indeces of the first and last codons
    #The first and last codons on the reference strand are the first and last amino acids deleted (necessary for code).
    #Iteration is through region from i_start up to and including i_end.
    
    #Step 3.1: Finding the first codon
    for i in range(i_start,i_end+1):
        if (reference[i]=="-"):
            continue
        elif (reference[i]!="-"):
            #Store the index of the first codon
            i_first=i
            break
    
    #Step 3.2: Finding the last codon
    for i in range(i_end,i_start-1,-1):
        if (reference[i]=="-"):
            continue
        elif (reference[i]!="-"):
            #Store the index of the last codon
            i_last=i
            break
            
    #Step 4: Iterate through the slice of the variant strand and store the codons, removing any blanks 
    #The function insertion_var_AA() will do this.
    codons_inserted=AA_blank_remover(i_start,i_end,j,alignment)
    
    #February 14th, 2020
    #Special Case: amino acid inserted is the same as the amino acids deleted (not really an indel)
    #if i_first==i_last and reference[i_first]==codons_inserted:
        #
    
    #Step 5: Record information using the indeces discovered.
    record_delins_info(i_start,i_end,i_first,i_last,j,codons_inserted,alignment,reference,ref_index,outfile)
    
    #Must return the position of the C-flanking codon to the indel region, so msa_iterator() does not record the region more than once.
    return (i_Cflank if i_Cflank else len(reference)) #If the end of the genome is reached, pass that index to msa_iterator().
    
def record_deletion_info(j,i_start,i_end,alignment,reference,ref_index,outfile):
    """
    Information recording
    """
    #Store the name of the cluster associated with the variant sequence (sequence index j)
    #The cluster name is in the id of the RefSeq object, before the ";".
    clustname=alignment[j].id.split(sep=";")[0]
    #Retrieve size of cluster associated with the variant sequence j
    #The cluster size is printed betweeen ";" and "=" in id
    clustsize=alignment[j].id.split(sep="=")[1].split(sep=";")[0]

    #Conditional: for single codon deletions, positions and amino acid residues will be stored as a single entry
    #For multi-codon deletions, data will be stored as a range
    #For single-codon deletions
    if i_start==i_end:
        #Store position of variant in MSA
        msa_start=i_end+1
        msa_end="-"
        #Store position of variant in reference genome
        ref_start=ref_index[i_end]
        ref_end="-"
        #Store codon in position i of reference genome
        ref_AA=reference[i_end]
        #Store the variant codon
        var_AA="-" #Retrieves the codon from the sequence at position j, at position i    
        #Store the deletion code
        """
        Based on the sequence variant nomenclature suggested by the Human Genome Variation Society,
        the code format is <Amino_Acid_Deleted><Position_of_Deleted_Codon, reference Genome>del
        source: (https://varnomen.hgvs.org/recommendations/protein/variant/deletion/)
        """
        code="{}{}del".format(ref_AA,ref_start)
    #For multi-codon deletions
    else:
        #Store position range in MSA
        msa_start=i_start+1
        msa_end=i_end+1

        #Store position range in reference genome
        ref_start=ref_index[i_start]
        ref_end=ref_index[i_end]

        #Store reference amino acids that have been deleted in sequence j
        #Slicing does not include the end value, so the end value should be one index after the end of the deletion region (i)
        ref_AA=deletion_ref_AA(i_start,i_end,reference)

        #Store a blank for deleted variant codons
        var_AA="-"
        """
        Store the deletion code
        
        Based on the sequence variant nomenclature suggested by the Human Genome Variation Society,
        the code format is <Beginning_Amino_Acid><Beginning_Codon_Position, reference Genome>_<Ending_Amino_Acid><Ending_Codon_Position>del
        source: (https://varnomen.hgvs.org/recommendations/protein/variant/deletion/)
        """
        code="{}{}_{}{}del".format(reference[i_start],ref_index[i_start],reference[i_end],ref_index[i_end])
    
    #Write above information to file
    write_line(outfile,clustname,clustsize,"del",code,ref_AA,var_AA,msa_start,ref_start,msa_end,ref_end)
    
def record_insertion_info(j,instype,i_start,i_end,alignment,reference,ref_index,outfile,i_Nflank=None,i_Cflank=None):
    #Store the name of the cluster associated with the variant sequence (sequence index j)
    #The cluster name is in the id of the RefSeq object, before the ";".
    clustname=alignment[j].id.split(sep=";")[0]
    #Retrieve size of cluster associated with the variant sequence j
    #The cluster size is printed betweeen ";" and "=" in id
    clustsize=alignment[j].id.split(sep="=")[1].split(sep=";")[0]
    
    #Store starting position on the multiple sequence alignment
    msa_start=i_start+1
    ref_AA="-"
    
    #msa_end and var_AA: depend on whether the insertion is a single base or a multi-base insertion 
    if i_start==i_end:
        msa_end="-"
        var_AA=alignment[j,i_start]
    else:
        msa_end=i_end+1
        var_AA=AA_blank_remover(i_start,i_end,j,alignment)
    
    #ref_start and ref_end: defined as the reference positions of the N- and C- terminal flanking codons if they exist
    ref_end=(ref_index[i_Cflank] if i_Cflank else "-") #C-flank is the flank downstream of the insertion.
    ref_start=(ref_index[i_Nflank] if i_Nflank else "-")
    
    #The flanking reference residues are stored for creation of the code, if they exist.
    Nflank=(reference[i_Nflank] if i_Nflank else None)
    Cflank=(reference[i_Cflank] if i_Cflank else None)
 
    
    #The "type" and code fields depend on whether the insertion is a normal case, an N-terminal extension, or a C-terminal extension.
    #TODO: instype will be passed through for output. Delete this later.
    if instype=="ins":
        print_type=instype
        #Code is formatted as <N_flank amino acid ><>_<><>ins<var_AA>
        code="{}{}_{}{}ins{}".format(Nflank,ref_start,Cflank,ref_end,var_AA)
        
    elif instype=="N_ext":
        #print type stores the type to be printed to the output file, as well as the type indicated in the code
        #Both N-terminal and C-terminal extentions will be represented as "ext"
        print_type="ext"
        #Code: will be formatted <C-flanking amino acid>1ext<Var_AA inserted>
        #TODO: This is at odds with the Human Genome Variation Society, which reccomends that extentions describe the number of new codons instead of their type.
        #I chose to print the type because we don't have the genetic information in this experiment. Should I keep things this way or change the format to match the reccomendations?
        code="{}1ext{}".format(Cflank,var_AA)
        
    elif instype=="C_ext":
        print_type="ext"
        #Code: will be formatted <N-flanking amino acid><reference position of N-flanking amino acid>ext<Var_AA inserted>
        code="{}{}ext{}".format(Nflank,ref_start,var_AA)
    
    #Write infomation to file 
    write_line(outfile,clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,ref_start,msa_end,ref_end)
    
def record_delins_info(i_start,i_end,i_first,i_last,j,codons_inserted,alignment,reference,ref_index,outfile):
    ###Record cluster information###
    #Store the name of the cluster associated with the variant sequence (sequence index j)
    #The cluster name is in the id of the RefSeq object, before the ";".
    clustname=alignment[j].id.split(sep=";")[0]
    #Retrieve size of cluster associated with the variant sequence j
    #The cluster size is printed betweeen ";" and "=" in id
    clustsize=alignment[j].id.split(sep="=")[1].split(sep=";")[0]
    
    #Record type (always equal to delins)
    print_type="delins"
    
    ###Record indel code###
    #Format: <first codon deleted><position of first codon deleted>_<last codon deleted><position of last codon deeted>delins<AA codons inserted>
    
    #Store identity and position of first codon deleted, using its index
    del_AA_first=reference[i_first]
    #ref_start=Position of first codon deleted
    ref_start=ref_index[i_first]
    
    #Code format depends on if there is one or more codons deleted. 
    #If multiple codons are deleted (i_first!=i_last), calculate the code in the format below.
    if i_first!=i_last:
        #Store identity and position of last codon deleted, using its index
        del_AA_last=reference[i_last]
        #ref_end=Position of last codon deleted
        ref_end=ref_index[i_last]
        #Indel code
        code="{}{}_{}{}delins{}".format(del_AA_first,ref_start,del_AA_last,ref_end,codons_inserted)
    #Follow the conditional below for single-codon deletions.
    else:
        ref_end="-" #will store a blank on the raw data output file.
        code="{}{}delins{}".format(del_AA_first,ref_start,codons_inserted)
        
    ###Record AA sequence on reference and variant strands###
    #ref_AA and var_AA will simply be the reference sequence and the variant sequence in the indel region,
    #Without blanks removed.
    ref_AA=str(reference[i_start:i_end+1])
    var_AA=str(alignment[j,i_start:i_end+1].seq)
    
    #Record start and end positions on multiple sequence alignment (equal to index of start and end of region plus one)
    msa_start=i_start+1
    msa_end=i_end+1
    
    #Write information to file
    write_line(outfile,clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,ref_start,msa_end,ref_end)
    
def deletion_ref_AA(i_start,i_end,reference):
    """
    For multi-base deletions, records the refernce AA's deleted without blanks that may appear in the aligned reference genome.
    
    This function should only be called on multi-base deletions, otherwise an error will occur
    """
    ref_AA=reference[i_start]
    #the amino acid at i_start is included first, and iteration goes up to and including i_end (must be i_end+1 because range() does not include stop value)
    for i in range(i_start+1,i_end+1):
        #If an amino acid exists on the reference genome (no blank on msa), add the AA to the sequence
        if reference[i]!="-":
            ref_AA=ref_AA+reference[i]
    return ref_AA

def AA_blank_remover(i_start,i_end,j,alignment):
    """
    AA_blank_remover() records the amino acids within a specified region on a specified genome j, 
    excluding blanks that may exist due to the multiple sequence alignment.
    
    For example, an insertion reading "FLP--AV" in the multiple sequence alginment will be converted to "FLPAV".
    
    i_start: the starting index of the region (integer)
    i_end: the ending index of the region (integer)
    j: the index of the genome to be read (integer)
    alingment: a biopython multiple sequence alignent object consisting of biopython SeqRecord objects.
    """
    #Begin with an empty variable 
    AA_Seq=None
    #Iterate from the first codon in the region to the last, including both first and last
    for i in range(i_start,i_end+1):
        #Blanks will be skipped by the conditionals below.
        #The first non-blank character will be detected and assigned to AA_seq
        if alignment[j,i]!="-" and not AA_Seq:
            AA_Seq=alignment[j,i]
        #If there is at least one codon already stored in AA_seq, the next codon will be appended to the existing sequence.
        elif alignment[j,i]!="-" and AA_Seq:
            AA_Seq=AA_Seq+alignment[j,i]
    return AA_Seq
                
"""
Part 2: Functions for analysis of raw variant data and report production
"""
def initialize_report_files(outname):
    """
    outname = the directory and file prefix of the output files. Specific files will contain 
    
    intitialize_report_files will create six files containing reports on the mutations present.

    """
    subs_report=outname+"_substitutions.txt"
    ins_report=outname+"_insertions.txt"
    dels_report=outname+"_deletions.txt"
    delins_report=outname+"_indels.txt"
    clusters_report=outname+"_clusters.txt"
    all_report=outname+"_all_by_code.csv"
    count_report=outname+"_variant_counts.csv"
    
    #Create blank files to be filled with information in later steps 
    for file in [subs_report,ins_report,dels_report,delins_report,clusters_report,all_report,count_report]:
        with (open(file,"w")) as outfile:
                outfile.write("")
    return subs_report,ins_report,dels_report,delins_report,clusters_report,all_report,count_report

#Create a class for storing all of the fields in the raw data file created in part 1
class VariantRecord:
    """
    Creates a variant_record object that stores the data for each variant produced in part 1. 
    
    --Instance variables--
    clustname: Cluster_ID
    clustsize: Number of sequences in cluster 
    print_type: Type of variant (insertion, deletion, substitution)
    code: Mutation code (formatted based on the nomenclature indicated by the Human Genome Variation Society https://varnomen.hgvs.org/recommendations/protein/)
    ref_AA: Amino Acid(s) of Reference Strand 
        For insertions, this is the two amino acid residues the insertion ocurrs between.
        For deletions, this will be the reference residue(s) deleted 
    var_AA: Variant Amino Acid Residue
        For multi-codon insertions, this field will contain multiple columns
        For deletions, this will be blank
    msa_start: Start position of variant on multiple sequence alignment
        For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
    msa_end: End position of variant on multiple sequence alignment 
        For insertions this will be the codon number on the multiple sequence alignment before the inserted codon(s)
        For substitutions and single-base deletions this will be blank
    ref_start: Start position of variant on reference strand 
        For insertions this will be the codon number on the reference strand before the inserted codon(s)
    ref_end: End position of variant on reference strand
        For insertions this will be the codon number on the reference strand before the inserted codon(s)
        For substitutions and single-base deletions this will be blank
    """
    def __init__(self,clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,msa_end,ref_start,ref_end):
        self.__clustname=clustname
        self.__clustsize=(int(clustsize) if clustsize!="-" else None) 
        self.__print_type=print_type
        self.__code=code
        self.__ref_AA=ref_AA
        self.__var_AA=var_AA
        self.__msa_start=(int(msa_start) if msa_start!="-" else None) #Blanks may exist in this region in the output file
        self.__msa_end=(int(msa_end) if msa_end!="-" else None)
        self.__ref_start=(int(ref_start) if ref_start!="-" else None)
        self.__ref_end=(int(ref_end) if ref_end!="-" else None)
    #Functions to print each variable    
    def get_clustname(self):
        return self.__clustname
    def get_clustsize(self):
        return self.__clustsize
    def get_print_type(self):
        return self.__print_type
    def get_code(self):
        return self.__code
    def get_ref_AA(self):
        return self.__ref_AA
    def get_var_AA(self):
        return self.__var_AA
    def get_msa_start(self):
        return self.__msa_start
    def get_msa_end(self):
        return self.__msa_end
    def get_ref_start(self):
        return self.__ref_start
    def get_ref_end(self):
        return self.__ref_end
    #Define method to change the code of the record (used when breaking down multi-base deletions in the count file)
    def set_code(self,new_code):
        """
        set_code(new_code)
        
        set_code will alter the code of the variant record object, and is used for processing multi-base
        deletions when setting up the CSV file for variant counts.
        
        new_code=The desired new code (string)
        WARNING: this will alter the VariantRecord object, and can corrupt results if not used properly.
        """
        self.__code=str(new_code)
    #Define output upon calling print() on the VariantRecord object
    def __str__(self):
        return "[{:<25}{:<5}{:<8}{:<15}]".format(self.__clustname,self.__clustsize,self.__print_type,self.__code)
        
def initialize_mutation_database(outname):
    """
    initialize_mutation_database will create a list of VariantRecord objects using the raw data file produced from 
    iteration through the multiple sequence alignment.
    """
    var_database=[]
    with open(outname+"_variants_raw.tsv","r") as file:
        #Load lines
        lines=file.readlines()
        #Exclude header
        for line in lines[1:]:
            #Remove newline characters 
            line=line.strip()
            #Store each field in its corresponding VariantRecord attribute
            clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,msa_end,ref_start,ref_end = line.split("\t")
            #Use attributes from the line to create a VariantRecord object, and add the object to the list.
            var_database.append(VariantRecord(clustname,clustsize,print_type,code,ref_AA,var_AA,msa_start,msa_end,ref_start,ref_end))
    return var_database    

#natsorted package used in the function below
from natsort import natsorted

def info_by_cluster(var_database,clusters_report):
    """
    info_by_cluster() will produce an output file listing the mutations present in each cluster, 
    sorted in ascending order, alphabetically and then numerically, by cluster ID.
    
    var_database: a list of VariantRecord objects created from the intermediate tab-separated file 
        produced by msa_iterator().
    
    clusters_report: the path of the file name for exporting info by cluster, created by 
        initialize_report_files().
    """
    #Get a list of unique cluster IDs
    #A set is used to retrieve the unique cluster IDs (sets store only unique values, with no ordering)
    clust_ids=set()
    for Record in var_database:
        clust_ids.add(Record.get_clustname())
    #Convert set back to a list to allow sorting
    clust_ids=list(clust_ids)
    #Sort based on ascending cluster ID number (requires natural sorting from the natsort package)
    clust_ids=natsorted(clust_ids)
    
    #Print info to outname_clusters.txt
    with open(clusters_report,"a") as file:
        #Print header to file
        print("-"*30,"Variants observed by cluster","-"*30,sep="\n",end="\n",file=file)

        #Iterate through unique cluster IDs, printing all variants associated with each cluster ID
        for clustname in clust_ids:
            #Iteratively subset database for the cluster id
            subset=[record for record in var_database if record.get_clustname()==clustname]
            #Print the header for the cluster id, then the number of sequences, then list the mutation code of each mutation event observed.
            #End the print statement with a double new line to leave one line of blank space between each cluster in the record.
            print(">{}, {} sequences".format(clustname,subset[0].get_clustsize()),*(record.get_code() for record in subset),sep="\n",end="\n\n",file=file)

def info_by_code(var_database,var_type,subs_report,ins_report,dels_report,delins_report):
    """
    info_by_code() will return information on the identity of clusters with a variant, for each unique variant 
    in the multiple sequence alignment. 
    
    var_database: A list of VariantRecord objects generated from the intermediate tab-separated file produced by msa_iterator().
    
    var_type: The type of variants to return information for. Accepts "sub" (substitutions),"ins" (insertions),
        "del"(deletions), and "delins"(insertion-deletions).
    
    subs_report: the path of the file name for exporting substitution info by code, created by 
        initialize_report_files().
        
    ins_report: the path of the file name for exporting insertion info by code, created by 
        initialize_report_files().
    
    dels_report: the path of the file name for exporting deletion info by code, created by 
        initialize_report_files().
        
    delins_report: the path of the file name for exporting insertion-deletion info by code, created by 
        initialize_report_files().
    """
    if var_type=="sub":
        #subset for substitutions
        substitutions=[record for record in var_database if record.get_print_type()=="sub"]

        #Create a set of unique mutation codes
        #Sorting should be performed based on the start position on the multiple sequence alignment. A dictionary is used to sort this
        subcodes={}
        for record in substitutions:
            #Conditional below returns a dictionary of unique mutation codes
            if record.get_code() not in subcodes:
                subcodes[record.get_code()]=record.get_msa_start()
        #Sort the dictionary by value of MSA start position (returns a list of the substitution codes sorted by msa start position)
        subcodes=sorted(subcodes, key=lambda x:subcodes[x])
    
        #Print info to outname_substitutions.txt
        with open(subs_report,"a") as file:
            #Print header to file
            print("-"*40,"Clusters by Mutation Code: Substitutions","-"*40,sep="\n",end="\n",file=file)

            #For each unique code, subset the substitutions database by that code and print statistics
            for code in subcodes:
                #Iteratively subset database for the mutation code
                subset=[record for record in var_database if record.get_code()==code]
                #Calculate total number of sequences with the mutation
                total_seq=0
                for record in subset:
                    #For each cluster with the mutation code, get the size of the cluster and add that to the total number of sequences
                    total_seq+=record.get_clustsize()
                #Print the header for the mutation code, then the total number of sequences with the code
                #Next, the cluster name and number of sequences in the cluster is printed for each cluster with the code 
                #End the print statement with a double new line to leave one line of blank space between each cluster in the record.
                print(">{}, {} sequences".format(code,total_seq),*("{}; {} sequences".format(record.get_clustname(),record.get_clustsize()) for record in subset),sep="\n",end="\n\n",file=file) 

    elif var_type=="ins":
        #Subset for insertions
        insertions=[record for record in var_database if record.get_print_type()=="ext" or record.get_print_type()=="ins"]
        #Create a set of unique mutation codes
        #Sorting should be performed based on the start position on the multiple sequence alignment. A dictionary is used to sort this
        inscodes={}
        for record in insertions:
            #Conditional below returns a dictionary of unique mutation codes
            if record.get_code() not in inscodes:
                inscodes[record.get_code()]=record.get_msa_start()
        #Sort the dictionary by value of MSA start position (returns a list of the insertion codes sorted by msa start position)
        inscodes=sorted(inscodes, key=lambda x:inscodes[x])
        
        #Print information to outname_insertions.txt
        with open(ins_report,"a") as file:
            #Print header to file
            print("-"*40,"Clusters by Mutation Code: Insertions","-"*40,sep="\n",end="\n",file=file)
            
            #For each unique code, subset the insertions database by that code and print statistics            
            for code in inscodes:
                #Iteratively subset database for the mutation code
                subset=[record for record in var_database if record.get_code()==code]
                #Calculate total number of sequences with the mutation
                total_seq=0
                for record in subset:
                    #For each cluster with the mutation code, get the size of the cluster and add that to the total number of sequences
                    total_seq+=record.get_clustsize()
                #Print the header for the mutation code, then the total number of sequences with the code
                #Next, the cluster name and number of sequences in the cluster is printed for each cluster with the code 
                #End the print statement with a double new line to leave one line of blank space between each cluster in the record.
                print(">{}, {} sequences".format(code,total_seq),*("{}; {} sequences".format(record.get_clustname(),record.get_clustsize()) for record in subset),sep="\n",end="\n\n",file=file)

    elif var_type=="del":
        #Subset for deletions
        dels=[record for record in var_database if record.get_print_type()=="del"]

        #Create a set of unique mutation codes
        #Sorting should be performed based on the start position on the multiple sequence alignment. A dictionary is used to sort this
        delcodes={}
        for record in dels:
            #Conditional below returns a dictionary of unique mutation codes
            if record.get_code() not in delcodes:
                delcodes[record.get_code()]=record.get_msa_start()
        #Sort the dictionary by value of MSA start position (returns a list of the deletion codes sorted by msa start position)
        delcodes=sorted(delcodes, key=lambda x:delcodes[x])

        with open(dels_report,"a") as file:
            #Print header to file
            print("-"*40,"Clusters by Mutation Code: Deletions","-"*40,sep="\n",end="\n",file=file)

            #For each unique code, subset the substitutions database by that code and print statistics
            for code in delcodes:
                #Iteratively subset database for the mutation code
                subset=[record for record in var_database if record.get_code()==code]
                #Calculate total number of sequences with the mutation
                total_seq=0
                for record in subset:
                    #For each cluster with the mutation code, get the size of the cluster and add that to the total number of sequences
                    total_seq+=record.get_clustsize()
                #Print the header for the mutation code, then the total number of sequences with the code
                #Next, the cluster name and number of sequences in the cluster is printed for each cluster with the code 
                #End the print statement with a double new line to leave one line of blank space between each cluster in the record.
                print(">{}, {} sequences".format(code,total_seq),*("{}; {} sequences".format(record.get_clustname(),record.get_clustsize()) for record in subset),sep="\n",end="\n\n",file=file) 

    elif var_type=="indel":
        #Subset for indels
        indels=[record for record in var_database if record.get_print_type()=="delins"]

        #Create a set of unique mutation codes
        #Sorting should be performed based on the start position on the multiple sequence alignment. A dictionary is used to sort this
        indelcodes={}
        for record in indels:
            #Conditional below returns a dictionary of unique mutation codes
            if record.get_code() not in indelcodes:
                indelcodes[record.get_code()]=record.get_msa_start()
        #Sort the dictionary by value of MSA start position (returns a list of the insertion-deletion codes sorted by msa start position)
        indelcodes=sorted(indelcodes, key=lambda x:indelcodes[x])
        
        with open(delins_report,"a") as file:
            #Print header to file
            print("-"*40,"Clusters by Mutation Code: Indels","-"*40,sep="\n",end="\n",file=file)
            #For each unique code, subset the substitutions database by that code and print statistics
            for code in indelcodes:
                #Iteratively subset database for the mutation code
                subset=[record for record in var_database if record.get_code()==code]
                #Calculate total number of sequences with the mutation
                total_seq=0
                for record in subset:
                    #For each cluster with the mutation code, get the size of the cluster and add that to the total number of sequences
                    total_seq+=record.get_clustsize()
                #Print the header for the mutation code, then the total number of sequences with the code
                #Next, the cluster name and number of sequences in the cluster is printed for each cluster with the code 
                #End the print statement with a double new line to leave one line of blank space between each cluster in the record.
                print(">{}, {} sequences".format(code,total_seq),*("{}; {} sequences".format(record.get_clustname(),record.get_clustsize()) for record in subset),sep="\n",end="\n\n",file=file) 

def mutation_csv(var_database,all_report,alignment):
    """
    mutation_csv() will produce a comma-separated values file with the following fields:
    
    Position: the position of the mutation on the reference strand. For insertions, this will be the
    Type: The type of mutation (sub,ins,del,indel,ext)
    Code: Mutation code, according to standard nomenclature
    Freq: The total number of sequences with the mutation.
    """
    unique_codes={}
    #First, all mutations will be sorted by the start position in the reference sequence. A dictionary will be used to achieve this
    for record in var_database:
        #A list of unique codes is generated by adding any code that is not already in the unique_codes dictionary.
        if record.get_code() not in unique_codes:
            #N-terminal extensions do not have an entry for ref_start(), so 1 will be used in this case 
            #(insertion ocurrs before first reference position).
            if record.get_print_type()=="ext" and not record.get_ref_start():
                unique_codes[record.get_code()]=1
            #For normal cases (ref_start exists), add the reference position to the dictionary for sorting
            elif record.get_ref_start():
                unique_codes[record.get_code()]=record.get_ref_start() if record.get_ref_start() else 1
            #Throw an error for any unanticipated records without a reference position recorded.
            else:
                raise ValueError("Unanticipated Case Missing Reference Position.")

    #Sort the dictionary of codes by reference position of variant
    unique_codes=sorted(unique_codes,key=lambda x:unique_codes[x])

    #Number of codes written to CSV is stored for progress reporting
    #Set counter to zero initially
    codes_printed=0

    #Write header of CSV to output file:
    with open(all_report,"a") as file:
        file.write("Position,Type,Code,Frequency,Percentage\n")
    
    #Store total number of sequences analyzed for percentage calculation
    total_prot_seq=total_sequences(alignment)
    
    #Subset the database for each unique code and find the reference position, the type of mutation, and the total number of sequences with the mutation
    for code in unique_codes:
        #Subset the database for the code
        subset=[record for record in var_database if record.get_code()==code]
        
        #Store reference position of the mutation. This will be the same for all mutation records with the same code,
        #So the operation is performed on the first element in the database.
        #For N-terminal extensions, get_ref_start() returns "None" because there is no N-terminal flank. 
        #In this case, "1" should be printed to the CSV file instead since the extension concerns the first base
        if subset[0].get_ref_start()==None and subset[0].get_print_type()=="ext":
            ref_start=1
        else:
            ref_start=subset[0].get_ref_start()
        
        #Store type of mutation (again the same for all mutations of the same code)
        print_type=subset[0].get_print_type()
        #Calculate total number of sequences with the mutation
        total_seq=0
        for record in subset:
            #For each cluster with the mutation code, get the size of the cluster and add that to the total number of sequences
            total_seq+=record.get_clustsize()
        
        #Calculated percentages of all sequences analyzed with the variant
        percent="{0:.4%}".format(total_seq/total_prot_seq)
        
        #Write information to CSV
        with open(all_report,"a") as file:
            print(ref_start,print_type,code,total_seq,percent,sep=",",end="\n",file=file)
        codes_printed+=1
        print("\rProgress: {}/{} codes written to file".format(codes_printed,len(unique_codes)),end="")
        
def break_up_del(var_database,reference,ref_index):
    """
    break_up_del will iterate through a list of VariantRecord objects and replace multi-codon deletion records 
    with one record per codon deleted. The output of break_up_del will be used for the creation of the CSV file
    for variant counts.
    
    var_database: a list of VariantRecord objects
    reference: the reference sequence (biopython SeqRecord object as a slice of a multiple sequence alignment)
    ref_index: a list mapping the multiple seq. alignment positions to the reference positions, used for creating the deletion code
    """
    #Set up a list for new records created to avoid corrupting the iteration through var_database
    new_records=[]
    #Iteratre through var_database
    for j in range(len(var_database)):
        print("\rProcessing record {} of {}.".format(j+1,len(var_database)),end="")
        #Search for multi_codon deletion records (print_type="del") and ref_end!="-"
        if var_database[j].get_print_type()=="del" and var_database[j].get_ref_end():
            #Store the record as "first record (will represent the first base deleted)"
            first_record=var_database[j]

            #Break up the multi-codon deltion record by creating a new record for every reference codon deleted,
            #Starting from the second codon that is deleted.
            for i in range(first_record.get_ref_start()+1,first_record.get_ref_end()+1):
                #A new variant record object will be set up, with attributes corresponding to a single-codon deletion at each position.
                #The reference AA at each position is calculated and stored for creation of the new VariantRecord object.
                new_ref_AA=reference[ref_index.index(i)]
                single_del_record=VariantRecord(first_record.get_clustname(),first_record.get_clustsize(),print_type="del",code="{}{}del".format(new_ref_AA,i),ref_AA=new_ref_AA,var_AA="-",msa_start=ref_index.index(i),msa_end="-",ref_start=i,ref_end="-")
                #Add the new single-base deletion to the variant database
                new_records.append(single_del_record)

            #After iterative creation of single-codon deletion records is complete, modify the first record.
            #The first record will store the code using the 1-letter code of the first codon deleted
            first_record.set_code("{}{}del".format(first_record.get_ref_AA()[0],first_record.get_ref_start()))
    #Once iteration through the variant database is complete, add the new records to the variant database.
    for record in new_records:
        var_database.append(record)
    #Move carriage to next line (prevents carriage return in next function from overwriting progress output)
    print()
    #Return the modified var_database
    return var_database

def total_sequences(alignment):
    """
    Iterates through all clusters in a multiple sequence alignment and returns the total number of sequences
    contained in all clusters.
    
    alignment: a list of biopython SeqRecord objects
    """
    total_seq=0
    for j in range(len(alignment)):
        #Iterate through every cluster in the alignment, adding the size of the cluster to the total
        #Cluster size is in the SeqRecord object id, between the equals sign and the semicolon
        total_seq+=int(alignment[j].id.split(sep="=")[1].split(sep=";")[0])
    return total_seq

def length_reference_sequence(ref_index):
    """
    The length of the reference sequence cannot be determined directly from the multiple sequence alignment,
    length_reference_sequence() will calculate the length of the reference sequence by finding the maximum 
    value entered into ref_index.
    """
    for i in range(len(ref_index)-1,0,-1):
        #Iterate backwards from the end of ref_index until a non-blank value is found.
        if ref_index[i]!="-":
            #The first non-blank value is the last codon position in the reference seuqence.
            ref_length=ref_index[i]
            #Return this value as the length of the reference sequence.
            return ref_length

def mutation_count_table(alignment,var_database,reference,ref_index,count_report):
    """
    
    """
    #Run break_up_del() to convert multi-codon deletions to individual deletion records
    var_database=break_up_del(var_database,reference,ref_index)
    #Store the total number of sequences
    total_seq=total_sequences(alignment)
    #Print header for CSV file
    with open(count_report,"a") as file:
        print("Residue_Number","Total_Variants","Percent_Variants","Substitutions","Insertions","Deletions","Extensions","Indels",sep=",",file=file)
    #Determine the length of the reference sequence for the protein
    ref_length=length_reference_sequence(ref_index)
    #Iterate through all reference residues in the sequence (starting at one)
    #For each position, subset the variant events dataset for the position,
    #and add the cluster size of each mutation event to the count of variations for that position.
    for j in range(1,ref_length+1):
        print("\rWriting CSV file: {} of {}.".format(j,ref_length+1),end="")
        #Special case: no variants exist for the given reference codon
        total_seq=total_sequences(alignment)
        if len([record for record in var_database if record.get_ref_start()==j])==0:
            dels=0
            ins=0
            subs=0
            exts=0
            indels=0
            total_variants=0
            percent_variants="0.0000%"
        #Standard case: variant records exist for codon j
        else:
            dels=0
            ins=0
            subs=0
            indels=0
            exts=0
            for record in [record for record in var_database if record.get_ref_start()==j]:
                if record.get_print_type()=="sub":
                    subs+=int(record.get_clustsize())
                elif record.get_print_type()=="del":
                    dels+=int(record.get_clustsize())
                elif record.get_print_type()=="ins":
                    ins+=int(record.get_clustsize())
                elif record.get_print_type()=="ext":
                    exts+=int(record.get_clustsize())
                elif record.get_print_type()=="delins":
                    indels+=int(record.get_clustsize())
            #Calculate the total number of genomes that have a variant codon at position j
            total_variants=ins+subs+dels+indels+exts
            #Calculate the percentage of all genomes entered that have a variant codon at position j.
            percent_variants=total_variants/total_seq
            #Convert percent_variants to a percentage, rounded to three decimal places
            percent_variants="{:.4%}".format(percent_variants)
        #Print data for reference codon j to CSV
        with open(count_report,"a") as file:
            print(j,total_variants,percent_variants,subs,ins,dels,exts,indels,sep=",",file=file)
    print("\rWriting CSV file: {} of {}.".format(ref_length+1,ref_length+1),end="")
            
def main(input_file,output_directory,output_name,reference_genome_location=0):
    """
    Required arguments for main function
    
    input_file=Path to the fasta file containing the multiple sequence alingment
    output_directory=Path to the desired directory for storing the output files. The directory will be created
        if it does not already exist.
    output_name=Identifying prefix to be added to the output files. It is recommended to set this to the name of
        the protein, without any spaces.
    reference_genome_location=The integer index of the cluster in the multiple sequence alignment that contains the 
        reference sequence. Indexing is zero-based, so the first cluster will correspond to index 0. Zero is the
        default value.
    """
    print("Part 1: Reading MSA to Generate Raw Database of Mutation Events")
    #Load MSA, store as biopython multiple sequence alingment object
    alignment = msa_in.read(input_file,'fasta')

    #Initialize files
    #Define output file name
    outdir=output_directory
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    outname=outdir+"/"+output_name
    raw_out = initalize_file(outname)

    #Define reference genome (here it is index 1 of the multiple sequence alignment)
    reference=alignment[reference_genome_location].seq
    #Create reference sequence object storing reference sequence indeces corresponding to msa index (if "-" in reference seq after alignment)
    ref_index=reference_position(reference)

    #Iterate through multiple sequence alignment and write raw data from variant information to file
    import time
    tic=time.time()
    msa_iterator(alignment,raw_out,reference,ref_index,raw_out)
    toc=time.time()
    sec=toc-tic
    print("\nTime elapsed: {:.0f} minutes {:.0f} seconds.".format(sec//60,sec%60))

    #Step 2: Report information from raw file
    tic=time.time()

    var_database=initialize_mutation_database(outname)

    print("Part 2: Text File Output for Each Mutation Type")
    #Initialize output files
    subs_report,ins_report,dels_report,delins_report,clusters_report,all_report,count_report=initialize_report_files(outname)

    #Print mutation events observed in each cluster
    info_by_cluster(var_database,clusters_report)

    #Print substitution events by code
    info_by_code(var_database,"sub",subs_report,ins_report,dels_report,delins_report)

    #Print insertion events by code
    info_by_code(var_database,"ins",subs_report,ins_report,dels_report,delins_report)

    #Print deletion events by code
    info_by_code(var_database,"del",subs_report,ins_report,dels_report,delins_report)

    #Print indel events by code
    info_by_code(var_database,"indel",subs_report,ins_report,dels_report,delins_report)
    toc=time.time()

    sec=toc-tic
    print("Time elapsed: {:.0f} minutes {:.0f} seconds.".format(sec//60,sec%60))

    print("Part 3: CSV generation")
    #Part 3: create csv table of mutation events
    tic=time.time()
    #Print all mutation events by code
    mutation_csv(var_database,all_report,alignment)
    toc=time.time()

    sec=toc-tic
    print("\nTime elapsed: {:.0f} minutes {:.0f} seconds.".format(sec//60,sec%60))

    print("Part 4: Variant Counts CSV file")
    tic=time.time()

    mutation_count_table(alignment,var_database,reference,ref_index,count_report)

    toc=time.time()
    sec=toc-tic
    print("\nTime elapsed: {:.0f} minutes {:.0f} seconds.".format(sec//60,sec%60))

def find_ref_cluster_id(directory):
    """
    Taking a 'where_reference.tsv' produced by the R script in 'reference_finder_script.Rmd', find_ref_cluster_id
    will map the location of the reference cluster for each protein alignment in the target directory
    to allow automatic variant analysis. Used with the Auto_Parse() function.
    
    directory: Path to the target directory containing the msa files and the 'where_reference.tsv' file.
    """
    #The variable below will become "True" when a where_reference file is read. If there are more than one 
    #where_reference files, an error will be returned.
    Already_Read=False
    #Find the 'where_reference.tsv' file in the destination directory
    for file in os.listdir(directory):
        #When found, create the pathway 
        if "where_reference.tsv" in file:
            if Already_Read==True:
                raise FileExistsError("There is more than one 'where_reference.tsv' file in the specified directory.\nPlease re-run with just one 'where_reference.tsv' file.")
            input_file=directory+file
            positions=dict()
            with open(input_file,"r") as file:
                #Create a list with one element for each line in the input file
                lines=file.readlines()
                #Remove newline characters in input
                for i in range(len(lines)):
                    lines[i]=lines[i].strip()
                #For each protein, extract protein name, and the name of the cluster with the ref. genome location in the corresponding protein
                for line in lines:
                    #For each protein, extract protein name, and 
                    protein=line.split(sep="\t")[0]
                    #Ectract cluster name corresponding to location of ref. genome in protein
                    cluster=line.split(sep="\t")[1]
                    #From the cluster name, extract corresponding cluster index in msa object 
                    #Extract cluter number from name (after 'Uniq')
                    cluster=cluster.split(sep="Uniq")[1]
                    #convert number to index: program handles 0-index indeces, but cluster names are 1-indexed
                    ref_index=int(cluster)-1
                    #Add dictionary entry with a key for the protein name, and value for the ref. position index
                    positions[protein]=ref_index
            #Set 'already_read' to True to catch multiple 'where_reference.tsv' files
            Already_Read=True
    #Return the dictionary with ref. index mappings
    return positions
    
def read_msa_all(in_directory,out_directory):
    """
    read_msa_all() will run the MSA program for all multiple sequence alignment files in a specified directory.
    The files in the target directory must end with "msa.fasta" to be parsed by the program. The genomes in 
    the multiple sequence alignment must be clustered with the following pattern for cluster names: 
    "Uniq<cluster_number(base_1)>;size=<number of genomes in cluster>;". This format can be produced using USEARCH  
    
    in_directory: relative or absolute path to the desired input directory
        THE INPUT DIRECTORY MUST CONTAIN THE FILE "where_reference.tsv" PRODUCED BY THE REFERENCE CLUSTER FINDER
        FOR THIS FUNCTION TO WORK PROPERLY.
    out_directory: relative or absolute path to the desired output directory. Sub-directories will be created 
        for each protein within this directory. Out_directory must exist for program to run.
    """
    #Create dictionary with reference cluster IDs for each protein msa being analyzed 
    positions=find_ref_cluster_id(in_directory)
    
    for filename in os.listdir(in_directory):
    #Only run program for files ending in msa.fasta to avoid errors
        if "msa.fasta" in filename:
            #The filename in the iterator does not include the path to the file. The path is added here
            input_file=in_directory+filename
            #The protein name is given at the beginning of the filename, before the first underscore.
            protein=filename.split("_")[0]
            #A directory will be created for each protein in the Variant_Call directory
            output_directory=out_directory+protein
            
            #Fetch cluster ID with reference genome
            reference_genome_location=positions[protein]
            #Print header for the protein
            print("-"*80)
            print(protein,end="\n\n")
            #Run program for file
            #TODO temporary code
            print("input file: ",input_file,sep="")
            print("output directory: ",output_directory,sep="")
            print("Protein Name: ",protein,sep="")
            print("reference genome cluster index: ",reference_genome_location,sep="")
            main(input_file,output_directory,protein,reference_genome_location)