# count_neand_hu.py
import glob
import pandas as pd
import os
from datetime import datetime
import re
start_time = datetime.now()
#FUNCTIONS
#******************************************************************************
# check what the dataframes look like: preview, dimensions, rows and columns
def verif_print(possible_list_of_dataframes):
    if isinstance(possible_list_of_dataframes, list):
        print(f'There are {len(possible_list_of_dataframes)} files')
        for count in range(len(possible_list_of_dataframes)):
            # print every dataframe in the list; returning inside the loop would stop after the first one
            print(f'{possible_list_of_dataframes[count].head(5)}\n',
                  f'DataFrame{count} dimensions/axes: {possible_list_of_dataframes[count].ndim}',
                  f'DataFrame{count} rows: {possible_list_of_dataframes[count].shape[0]}',
                  f'DataFrame{count} columns: {possible_list_of_dataframes[count].shape[1]}',
                  sep='\n', end='\n*******************\n')
    else:
        print(f'{possible_list_of_dataframes.head(5)}\n',
              f'Dimensions/axes of the reference file: {possible_list_of_dataframes.ndim}',
              f'Number of genomic positions (=rows): {possible_list_of_dataframes.shape[0]}',
              f'Number of columns: {possible_list_of_dataframes.shape[1]}',
              sep='\n', end='\n*******************\n')
# build the path to a file relative to the current working directory
def open_path(file_name):
    path_ref = os.path.join(os.getcwd(), file_name)
    return path_ref
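# e.g. open_path('new/jar1.mpileup') returns '<cwd>/new/jar1.mpileup' (hypothetical file name)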
# open the reference file and name its columns
def input_file(file_name):
    with open(open_path(file_name)) as my_fileRef:
        my_ref = pd.read_csv(my_fileRef, sep='\t', header=None, dtype='object', na_filter=False)
    my_ref.columns = ['chr', 'position', 'base_orig', 'base_deriv', 'group', 'empty']  # name the columns to make them easier to work with
    return my_ref
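# the reference file is expected to be tab-separated, one row per genomic position,
# e.g. (hypothetical values): 1 <tab> 752566 <tab> G <tab> A <tab> Neandertal <tab> ''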
# evaluate the results: compare the match counts for the two groups
def evaluate(df_N, df_hu):
    if df_N.shape[0] < df_hu.shape[0]:
        print(f'\nNeand:{df_N.shape[0]} - Hu:{df_hu.shape[0]}\nThe individual is most likely related to modern humans', end='\n*******************\n')
    elif df_N.shape[0] > df_hu.shape[0]:
        print(f'\nNeand:{df_N.shape[0]} - Hu:{df_hu.shape[0]}\nThe individual is most likely related to Neanderthals', end='\n*******************\n')
    else:
        print(f'\nNeand:{df_N.shape[0]} - Hu:{df_hu.shape[0]}\nEqual number of matching positions', end='\n*******************\n')
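# e.g. with 80 Neandertal-matching rows and 120 human-matching rows (illustrative
# counts), evaluate prints 'Neand:80 - Hu:120' followed by the modern-human message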
# decide which base to keep when a position has multiple reads with different results:
# take the most frequent base and break ties by sampling one of the tied bases at random
def most_common_base(my_list):
    base_counts = pd.Series(my_list).value_counts()  # counts per base, sorted in descending order
    df_results = pd.DataFrame({'value': base_counts.index, 'count': base_counts.values})
    find_max_val = df_results['count'].max(skipna=True)
    most_common_baseS = df_results.loc[df_results['count'] == find_max_val, 'value']  # df.loc[mask, column]
    if len(most_common_baseS) == 1:
        return most_common_baseS.iloc[0]
    # several bases are tied for the maximum count: sample one of them at random
    random_sampled = most_common_baseS.sample(n=1)
    return random_sampled.iloc[0]
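# e.g. most_common_base(['A', 'A', 'G']) returns 'A'; for a tie such as ['A', 'G']
# either base may be returned (illustrative inputs)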
# clean the 'base_read' column and work out which base was actually read
def base_stripper(row):
    # keep only base letters (A/C/G/T in either case) from the pileup string
    matchesList = re.findall(r'[ACGT]', row['base_read'], flags=re.IGNORECASE)
    matchesList = [item.upper() for item in matchesList]  # make all uppercase
    if len(matchesList) == 1:  # exactly one read covers this position
        return matchesList[0]
    elif len(matchesList) > 1:  # several reads: different bases, or the same base in different case
        return most_common_base(matchesList)
    else:  # the list is empty: no base letter found, fall back to the reference base
        return row['base_reference']
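# e.g. (hypothetical pileup strings): 'a' yields 'A'; 'a,T' yields one of 'A'/'T'
# via most_common_base; ',,..' has no base letter, so the reference base is kept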
# CODE
#******************************************************************************
# open the reference file and check it
file_ref = 'new/duplicatedNOT_rows_tab_5k'
my_ref = input_file(file_ref)
# print('The reference file:')
# verif_print(my_ref)
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read in all necessary '.mpileup' files as a list of dataframes
# glob once and sort, so the dataframes and the file names stay in the same order;
# header=None because mpileup files carry no header line
mpileup_paths = sorted(glob.glob(open_path('new/jar*')))
df = [pd.read_csv(path, sep='\t', header=None, dtype='object') for path in mpileup_paths]
# extract the name of each file to use later
file_name = [os.path.basename(f) for f in mpileup_paths]
# print(file_name)
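# each mpileup row is tab-separated: chromosome, position, reference base,
# number of reads, read bases and base qualities (matching the column names below)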
# loop through the sequence files and get the needed data
for a in range(len(df)):
    # name each column of the sequence file to make it easier to work with
    df[a].columns = ['chr', 'position', 'base_reference', 'nr_reads', 'base_read', 'quality']
    print(f'\n The DataFrame from sequence file "{file_name[a]}":')
    # verif_print(df[a])
    # merge the reference file and one sequence file to get all the needed data in one dataframe
    # this shrinks the result because only rows with a common ('chr', 'position') pair are kept
    common_df = pd.merge(left=my_ref[['chr', 'position', 'base_orig', 'base_deriv', 'group']], right=df[a][['chr', 'position', 'base_reference', 'base_read']], how='inner', on=['chr', 'position'])
    print(f'"{file_name[a]}": The dataframe with the common values (reference file and sequence):', f'{common_df.head(5)}', sep='\n', end='\n******************\n')
    # drop NA rows and rows whose only reading is '*'
    # replace missing data with '*' (not needed if the data was already checked for missing values and NA)
    common_df = common_df.replace(r'^\s*$', '*', regex=True)
    print(f'"{file_name[a]}": The common dataframe after replacing missing data with "*":', f'{common_df.shape}', f'{common_df.head(5)}', sep='\n', end='\n***************\n')
    indexNames = common_df[common_df['base_read'] == '*'].index  # all rows whose 'base_read' is exactly '*'
    common_df.drop(indexNames, inplace=True)
    print(f'"{file_name[a]}": The common dataframe after dropping the "*" rows:', f'{common_df.shape}', f'{common_df.head(5)}', sep='\n', end='\n*************\n')
    # strip the 'base_read' column and assign the final base for each position
    common_df['base_read'] = common_df.apply(base_stripper, axis=1)
    print(f'"{file_name[a]}": The common dataframe after the final filtering:', f'{common_df.shape}', f'{common_df.head(5)}', sep='\n', end='\n***********\n')
    # get all the rows for which 'base_deriv' and 'base_read' are identical:
    # nunique(axis=1) counts the distinct values across the two columns in each row,
    # so a count of 1 marks the rows where the sequenced base equals the derived base in the reference
    mask = common_df[['base_deriv', 'base_read']].nunique(axis=1) == 1
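    # e.g. (hypothetical row) base_deriv 'A' with base_read 'A' gives nunique == 1 (kept),
    # while base_deriv 'A' with base_read 'G' gives nunique == 2 (filtered out)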
    # count only the rows of interest: see the next two blocks of code
    df_N = common_df[mask & (common_df['group'] == 'Neandertal')]  # keep only the matching rows whose 'group' is 'Neandertal'
    print(f'"{file_name[a]}": The dataframe with the results only for Neandertal', f'{df_N.head(5)}', f'{df_N.shape}', sep='\n', end='\n********************\n')
    df_hu = common_df[mask & (common_df['group'] == 'human')]
    print(f'"{file_name[a]}": The dataframe with the results only for human', f'{df_hu.head(5)}', f'{df_hu.shape}', sep='\n', end='\n*************\n')
    print('The results, without Denisovan contribution:')
    evaluate(df_N, df_hu)
    # delete this part if NOT interested in counting the Denisovan contribution to the final result
    # same as before, but also counting the groups shared with Denisovans
    df_ND = common_df[mask & ((common_df['group'] == 'Neandertal-Denisova') | (common_df['group'] == 'Neandertal'))]
    df_huD = common_df[mask & ((common_df['group'] == 'Human-Denisova') | (common_df['group'] == 'human'))]
    print('The results, when considering the Denisovan contribution:')
    evaluate(df_ND, df_huD)
end_time = datetime.now()
run_duration = end_time - start_time  # a timedelta object
print(f'Duration: {run_duration}')
# !!! plays a sound when the script is done
# requires 'sox' to be installed (sudo apt install sox)
duration = 1  # seconds
freq = 440  # Hz
os.system('play -nq -t alsa synth {} sine {}'.format(duration, freq))