#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
AssembleMmiBatches.py
Gathers all the batches imported by ImportMmiData_May2020.py, gets info about
them, and saves that info to table Mmi-Batches.csv. Also combines groups of
batches that will later be analyzed as a single group.
Created 6/2/20 by DJ.
Updated 3/31/21 by DJ - adapted for shared code structure.
Updated 4/2/21 by DJ - added overwrite flag.
Updated 4/8/21 by DJ - added line to calculate Recovery(Instructed)1.
Updated 1/3/22 by DJ - added option to include or exclude 'control' batches
collected in 2021.
Updated 8/11/22 by DJ - added MwBeforeAndAfter to AllOpeningRestControls batch
"""
# %% Import packages
import pandas as pd
import numpy as np
import os
from glob import glob
import MoodDrift.Analysis.PlotMmiData as pmd
from MoodDrift.Preprocessing.CombineMmiBatches import CombineMmiBatches
# Declare directories and processing options
dataCheckDir = '../Data/DataChecks'
outDir = '../Data/OutFiles'
includeRepeats = False  # should the "AllOpeningRestAndRandom" superbatch include returning subjects?
includeControlsInSuperbatches = False  # should AdultOpeningRest and AllOpeningRestAndRandom include relevant control batches?
overwrite = True  # overwrite existing files?

# Get batch names from the DataCheck files
print('Gathering batches...')
batchFiles_glob = glob('%s/*DataCheck.csv'%dataCheckDir)
batchFiles_glob.sort()
print('%d batches found.'%len(batchFiles_glob))
batchFiles = []
batchNames = []
batchStart = []
batchEnd = []
taskFiles = []
nSubjAttempted = []
nSubjCompleted = []
# %%
print('Getting info from DataCheck files...')
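# Assumed file-naming conventions (not verified here): DataCheck files are named
# '<batchName>_..._DataCheck.csv', so the batch name is the text before the first underscore,
# and task files carry the date as the second-to-last underscore-separated field, e.g. a
# hypothetical 'Mmi-Recovery1_subj01_2020-06-02_1200.csv' would yield the date '2020-06-02'.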
for batchFile in batchFiles_glob:
    print(' batch %s...'%batchFile)
    dfDataCheck = pd.read_csv(batchFile,index_col=0) # read in file
    batchName = os.path.basename(batchFile).split('_')[0] # extract name of batch
    # Online adolescent participants: split across runs
    if 'Nimh' in batchFile:
        nRuns = 3
        for run in range(1,nRuns+1):
            minDate = '2999-01-01'
            maxDate = '1999-01-01'
            for iLine in range(dfDataCheck.shape[0]):
                taskFile = dfDataCheck.loc[iLine,'taskFile_run%d'%run]
                if isinstance(taskFile,str) and not taskFile[0].isdigit(): # if it's a string and not '0.0'
                    minDate = min(minDate,dfDataCheck.loc[iLine,'taskFile_run%d'%run].split('_')[-2])
                    maxDate = max(maxDate,dfDataCheck.loc[iLine,'taskFile_run%d'%run].split('_')[-2])
            batchNames.append('%s-run%s'%(batchName,run))
            batchFiles.append(batchFile)
            batchStart.append(minDate)
            batchEnd.append(maxDate)
            # Add number of subjects
            nSubjAttempted.append(np.sum(pd.notna(dfDataCheck['taskFile_run%d'%run])))
            nSubjCompleted.append(np.sum(pd.notna(dfDataCheck['taskFile_run%d'%run])))
    else: # online adult participants: no splitting
        minDate = '2999-01-01'
        maxDate = '1999-01-01'
        for iLine in range(dfDataCheck.shape[0]):
            taskFile = dfDataCheck.loc[iLine,'taskFile']
            if isinstance(taskFile,str) and not taskFile[0].isdigit(): # if it's a string and not '0.0'
                minDate = min(minDate,dfDataCheck.loc[iLine,'taskFile'].split('_')[-2])
                maxDate = max(maxDate,dfDataCheck.loc[iLine,'taskFile'].split('_')[-2])
        batchNames.append(batchName)
        batchFiles.append(batchFile)
        batchStart.append(minDate)
        batchEnd.append(maxDate)
        # Add number of subjects
        nSubjAttempted.append(dfDataCheck.shape[0])
        nSubjCompleted.append(np.sum(dfDataCheck.isComplete))
# Create Dataframe
print('Creating dataframe...')
maxNBlocks = 4
cols = ['batchName','startDate','endDate','nSubjAttempted','nSubjCompleted',
        'dataCheckFile','ratingsFile','trialFile','surveyFile','lifeHappyFile',
        'probesFile','pymerInputFile','pymerCoeffsFile','nPreviousRuns','isNimhCohort'] + \
       ['block%d_type'%run for run in range(maxNBlocks)] + \
       ['block%d_targetHappiness'%run for run in range(maxNBlocks)] + \
       ['block%d_nTrials'%run for run in range(maxNBlocks)] + \
       ['block%d_nRatings'%run for run in range(maxNBlocks)] + \
       ['block%d_meanDuration'%run for run in range(maxNBlocks)]
nBatches = len(batchNames)
dfBatches = pd.DataFrame(np.ones((nBatches,len(cols)))*np.nan,columns=cols)
dfBatches['batchName'] = batchNames
dfBatches['startDate'] = batchStart
dfBatches['endDate'] = batchEnd
dfBatches['nSubjAttempted'] = nSubjAttempted
dfBatches['nSubjCompleted'] = nSubjCompleted
dfBatches['dataCheckFile'] = batchFiles
dfBatches['ratingsFile'] = ['%s/Mmi-%s_Ratings.csv'%(outDir,batchName) for batchName in batchNames]
dfBatches['trialFile'] = ['%s/Mmi-%s_Trial.csv'%(outDir,batchName) for batchName in batchNames]
dfBatches['surveyFile'] = ['%s/Mmi-%s_Survey.csv'%(outDir,batchName) for batchName in batchNames]
dfBatches['lifeHappyFile'] = ['%s/Mmi-%s_LifeHappy.csv'%(outDir,batchName) for batchName in batchNames]
dfBatches['probesFile'] = ['%s/Mmi-%s_Probes.csv'%(outDir,batchName) for batchName in batchNames]
dfBatches['pymerInputFile'] = ['%s/Mmi-%s_pymerInput.csv'%(outDir,batchName) for batchName in batchNames]
dfBatches['pymerCoeffsFile'] = ['%s/Mmi-%s_pymerCoeffs.csv'%(outDir,batchName) for batchName in batchNames]
# Mark repeat administrations: '02'/'run2' in a batch name indicates a second run (1 previous run),
# '03'/'run3' indicates a third run (2 previous runs)
dfBatches['nPreviousRuns'] = 0
dfBatches.loc[['02' in batchName for batchName in batchNames],'nPreviousRuns'] = 1
dfBatches.loc[['run2' in batchName for batchName in batchNames],'nPreviousRuns'] = 1
dfBatches.loc[['03' in batchName for batchName in batchNames],'nPreviousRuns'] = 2
dfBatches.loc[['run3' in batchName for batchName in batchNames],'nPreviousRuns'] = 2
dfBatches['isNimhCohort'] = ['Nimh' in batchName for batchName in batchNames]
# Get block info
print('Getting block info...')
for iBatch,batchName in enumerate(batchNames):
    print(' batch %s...'%batchName)
    # Load
    dfRating = pd.read_csv('%s/Mmi-%s_Ratings.csv'%(outDir,batchName))
    dfTrial = pd.read_csv('%s/Mmi-%s_Trial.csv'%(outDir,batchName))
    # Get averages
    dfRatingMean = pmd.GetMeanRatings(dfRating)
    dfTrialMean = pmd.GetMeanTrials(dfTrial)
    tBlockSwitch,blockType = pmd.GetBlockTimes(dfTrial,dfRating)
    nBlocks = len(blockType)
    for iBlock in range(nBlocks):
        isThis = (dfTrialMean.iBlock==iBlock)
        if (batchName=='Activities') and (iBlock==0):
            dfBatches.loc[iBatch,'block%d_type'%iBlock] = 'Activities'
            dfBatches.loc[iBatch,'block%d_targetHappiness'%iBlock] = np.nan
            dfBatches.loc[iBatch,'block%d_nTrials'%iBlock] = 0
            isThis = (dfRatingMean.iBlock==iBlock)
            dfBatches.loc[iBatch,'block%d_nRatings'%iBlock] = np.sum(isThis)
            dfBatches.loc[iBatch,'block%d_meanDuration'%iBlock] = tBlockSwitch[iBlock+1] - tBlockSwitch[iBlock]
        else:
            dfBatches.loc[iBatch,'block%d_type'%iBlock] = dfTrialMean.loc[isThis,'trialType'].values[0]
            dfBatches.loc[iBatch,'block%d_targetHappiness'%iBlock] = dfTrialMean.loc[isThis,'targetHappiness'].values[0]
            dfBatches.loc[iBatch,'block%d_nTrials'%iBlock] = np.sum(isThis)
            isThis = (dfRatingMean.iBlock==iBlock)
            dfBatches.loc[iBatch,'block%d_nRatings'%iBlock] = np.sum(isThis)
            dfBatches.loc[iBatch,'block%d_meanDuration'%iBlock] = tBlockSwitch[iBlock+1] - tBlockSwitch[iBlock]
outFile = '%s/Mmi-Batches.csv'%outDir
print('Writing to %s...'%outFile)
if os.path.exists(outFile) and not overwrite:
    print('Not overwriting existing file.')
else:
    dfBatches.to_csv(outFile)
print('Done!')
# %% Combine two batches with identical trials
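# CombineMmiBatches (imported above) presumably merges the per-batch output files of the listed
# batches into a single combined batch saved under the new name ('Recovery(Instructed)1' here),
# so downstream analyses can treat the cohorts as one group (see the module docstring above).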
CombineMmiBatches(['Recovery1','RecoveryInstructed1'],'Recovery(Instructed)1');
# %% Assemble batches with no opening rest, short opening rest, and long opening rest
# Reload batch info file
batchFile = '%s/Mmi-Batches.csv'%outDir
print('Reading batch info from %s...'%batchFile)
dfBatches = pd.read_csv(batchFile,index_col=0)
# Get batches matching our description
isNoRestBatch = (dfBatches.nPreviousRuns==0) & \
                (dfBatches.block0_type=='closed') & \
                ((dfBatches.block0_targetHappiness=='1.0') | \
                 (dfBatches.block0_targetHappiness=='1'))
isShortBatch = (dfBatches.nPreviousRuns==0) & \
               (dfBatches.block0_type=='rest') & \
               (dfBatches.block1_type=='closed') & \
               ((dfBatches.block1_targetHappiness=='1.0') | \
                (dfBatches.block1_targetHappiness=='1')) & \
               (dfBatches.block0_meanDuration<500)
isLongBatch = (dfBatches.nPreviousRuns==0) & \
              (dfBatches.block0_type=='rest') & \
              (dfBatches.block1_type=='closed') & \
              ((dfBatches.block1_targetHappiness=='1.0') | \
               (dfBatches.block1_targetHappiness=='1')) & \
              (dfBatches.block0_meanDuration>=500)
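# Note: block0_meanDuration is presumably in seconds (units not verified here), so the 500 cutoff
# above separates opening rest periods shorter than ~8 minutes ('short') from longer ones ('long').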
adultopeningrestbatches = ['COVID01', 'Expectation-12min', 'Expectation-7min', 'Numbers',
                           'Recovery1', 'RecoveryInstructed1Freq0p25',
                           'RecoveryInstructed1Freq0p5', 'RecoveryInstructed1Freq2',
                           'RecoveryInstructed1', 'RestDownUp', 'Stability01-Rest']
# If requested, add relevant control batches to AdultOpeningRest.
# Exclude BeforeAndAfter batches where repeated administration changed the results.
if includeControlsInSuperbatches:
    newBatches = ['BoredomAfterOnly','MwAfterOnly','MwBeforeAndAfter']
    print(f'Adding control batches {newBatches} to AdultOpeningRest...')
    adultopeningrestbatches = adultopeningrestbatches + newBatches
# Create new "superbatches" that combine multiple cohorts
CombineMmiBatches(dfBatches.loc[isNoRestBatch,'batchName'].values,'NoOpeningRest');
CombineMmiBatches(dfBatches.loc[isShortBatch,'batchName'].values,'ShortOpeningRest');
CombineMmiBatches(dfBatches.loc[isLongBatch,'batchName'].values,'LongOpeningRest');
CombineMmiBatches(dfBatches.loc[isShortBatch | isLongBatch,'batchName'].values,'AnyOpeningRest');
CombineMmiBatches(dfBatches.loc[isNoRestBatch | isShortBatch | isLongBatch,'batchName'].values,'AnyOrNoRest');
CombineMmiBatches(adultopeningrestbatches,'AdultOpeningRest');
# %% Assemble batch of all cohorts with opening rest or random-gambling block for use in large-scale LME analysis
# Get batches that match our description
dfBatch = dfBatches[['batchName','ratingsFile','surveyFile','trialFile','lifeHappyFile','block0_type','nPreviousRuns','endDate']]
dfBatch = dfBatch.loc[(dfBatches.block0_type=='rest') | (dfBatches.block0_type=='random'),:]
if not includeRepeats:
    print('Excluding repeat participants...')
    dfBatch = dfBatch.loc[(dfBatch.nPreviousRuns==0),:]
if not includeControlsInSuperbatches:
    print('Excluding control batches collected in 2021 from AllOpeningRestAndRandom...')
    dfBatch = dfBatch.loc[(dfBatch.endDate < '2021-01-01'),:]
# Exclude these specific batches by name (dfBatch has an integer index, so match on batchName)
dfBatch = dfBatch.loc[~dfBatch.batchName.isin(['Stability01-random','Stability02-random','RecoveryNimh-run3']),:]
# Create "superbatch" that combines multiple cohorts
CombineMmiBatches(dfBatch['batchName'].values,'AllOpeningRestAndRandom',makeSubjectsMatchPymer=True);
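# makeSubjectsMatchPymer=True presumably restricts the combined batch to subjects that also appear
# in the pymer input/coefficient files, so the superbatch lines up with the LME (pymer) analyses;
# this is an inference from the argument name, not verified here.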
# %% Assemble batch of MW before+after and boredom before+after cohorts
# Get MW cohort
CombineMmiBatches(['MwBeforeAndAfter','MwAfterOnly'],'AllMw',makeSubjectsMatchPymer=True);
# Get boredom cohort
CombineMmiBatches(['BoredomBeforeAndAfter','BoredomAfterOnly'],'AllBoredom',makeSubjectsMatchPymer=True);
# Get combined cohort to replicate results (exclude before-and-after cohorts where repeated administration affected results)
CombineMmiBatches(['MwBeforeAndAfter','MwAfterOnly','BoredomBeforeAndAfter','BoredomAfterOnly'],'AllOpeningRestControls',makeSubjectsMatchPymer=True);