add up to date CCA scripts

nih-fmrif · Oct 5, 2021 · 7cf91c0 · 7cf91c0
1 parent 75bdf91
commit 7cf91c0
Show file tree

Hide file tree

Showing 6 changed files with 940 additions and 0 deletions.
diff --git a/abcd_basic_proc.m b/abcd_basic_proc.m
@@ -0,0 +1,137 @@
+% abcd_basic_proc.m
+% Written by Nikhil Goyal, National Institute of Mental Health, 2019-2020
+% Created: June 2020
+% Modified: 7/1/2020
+
+% CCA processing script for the ABCD connectome and SM data
+% Required inputs are the NET.txt and VARS.txt files.
+
+function abcd_basic_proc(N_perm, N_dim, abcd_cca_dir, n_subs)
+    if nargin<4
+        sprintf("ERROR, not enough arguments.")
+        sprintf("Example: abcd_basic_proc(100000, 70, '/data/ABCD_MBDU/goyaln2/abcd_cca_replication/', 5013)")
+        return
+	end
+
+	%% Add the dependencies folders to the PATH, and read in necessary data
+	if ~isdeployed
+		addpath(genpath(sprintf('%s/dependencies/', abcd_cca_dir)));
+		addpath(genpath(sprintf('%s/data/', abcd_cca_dir)));
+	end
+
+	% Read in data, set some variables, create confounds matrix
+	% VARS_0 = Subjects X SMs text file
+	VARS_0=strcsvread(sprintf('%s/data/%d/VARS.txt', abcd_cca_dir, n_subs));
+
+	% Load the Subjects X Nodes matrix (should be size Nx19900)
+	N0=load(sprintf('%s/data/%d/NET.txt', abcd_cca_dir, n_subs));   
+
+	% Load list of SMs to be used in ICA (this list is made manually)
+	ica_sms_0=fileread(sprintf('%s/data/ica_subject_measures.txt', abcd_cca_dir));
+	ica_sms = strsplit(ica_sms_0);
+
+	% Load list of names of colums used to encode scanner ID
+	scanner_col_names_0=fileread(sprintf('%s/data/%d/scanner_confounds.txt', abcd_cca_dir, n_subs));
+	scanner_col_names = strsplit(scanner_col_names_0);
+
+	% Drop subject col and device serial number col (they are strings)
+	egid_col    = find(strcmpi(VARS_0(1,:),'subjectid'));
+	serial_col  = find(strcmpi(VARS_0(1,:),'mri_info_device.serial.number'));
+	VARS_0(:,[egid_col serial_col])=[];
+
+	% Get column indices of our confound variables
+	[sharedvals,scanner_cols_idx]=intersect(VARS_0(1,:),scanner_col_names);
+	site_col        = find(strcmpi(VARS_0(1,:),'abcd_site'));
+	mri_man_col     = find(strcmpi(VARS_0(1,:),'mri_info_manufacturer'));
+	mean_fd_col     = find(strcmpi(VARS_0(1,:),'mean_fd'));
+	bmi_col         = find(strcmpi(VARS_0(1,:),'anthro_bmi_calc'));
+	weight_col      = find(strcmpi(VARS_0(1,:),'anthro_weight_calc'));
+	wholebrain_col  = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_wholebrain'));
+	intracran_col   = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_intracranialvolume'));
+
+	% Get column indices of the ICA SMs
+	[sharedvals,ica_sms_idx]=intersect(VARS_0(1,:),ica_sms);
+
+	% VARS without column names
+	VARS=cell2mat(VARS_0(2:end,:));
+
+	% --- CREATE CONFOUND MATRIX ---
+	% Set up confounds matrix. Confounds are demeaned, any missing data is set to 0
+	% Confounds are:
+	% 1. Individual scanner IDs (subsitute for abcd_site and mri_info_manufacturer)
+	% 2. mean_fd
+	% 3. anthro_bmi_calc
+	% 4. anthro_weight_calc
+	% 5. smri_vol_subcort.aseg_wholebrain
+	% 6. smri_vol_subcort.aseg_intracranialvolume 
+	% Additional confounds from demeaning and squaring all confounds 
+	conf  = palm_inormal([ VARS(:,scanner_cols_idx) VARS(:,[mean_fd_col bmi_col weight_col]) VARS(:,[wholebrain_col intracran_col]).^(1/3) ]);  % Gaussianise
+	conf(isnan(conf)|isinf(conf)) = 0;                % impute missing data as zeros
+	conf  = nets_normalise([conf conf(:,length(scanner_cols_idx):end).^2]);  % add on squared terms and renormalise (all cols other than those for scanner IDs)
+	conf(isnan(conf)|isinf(conf)) = 0;                % again convert NaN/inf to 0 (above line makes them reappear for some reason)
+
+	% --- SM PROCESSING ---
+	% Matrix S1 (only ICA sms)
+	S1=[VARS(:,ica_sms_idx)];
+
+	% Now, prepare the final SM matrix and run the PCA
+	% S2, formed by gaussianizing the SMs we keep
+	S2=palm_inormal(S1); % Gaussianise
+
+ 	% Now generate S3, formed by deconfounding the 17 confounds out of S2
+	S3=S2;
+	for i=1:size(S3,2) % deconfound ignoring missing data
+		grot=(isnan(S3(:,i))==0);
+		grotconf=nets_demean(conf(grot,:));
+		S3(grot,i)=normalize(S3(grot,i)-grotconf*(pinv(grotconf)*S3(grot,i)));
+	end
+
+	% Determine how much data is missing:
+	sum(sum(isnan(S3)))/(size(S3,1)*size(S3,2))*100
+
+	% Next, we need to generate S4
+	% First, estimate the SubjectsXSubjects covariance matrix (where for any two subjects, SMs missing for either subject are ignored)
+	% The approximate covariance matrix (varsdCOV) is then projected onto the nearest valid covariance matrix using nearestSPD toolbox.
+	% This method avoids imputation, and S4 can be fed into PCA.
+	S3Cov=zeros(size(S3,1));
+	for i=1:size(S3,1) % estimate "pairwise" covariance, ignoring missing data
+		for j=1:size(S3,1)
+			grot=S3([i j],:);
+			grot=cov(grot(:,sum(isnan(grot))==0)');
+			S3Cov(i,j)=grot(1,2);
+		end
+	end
+	S4=nearestSPD(S3Cov); % project onto the nearest valid covariance matrix. This method avoids imputation (we can't have any missing values before running the PCA)
+
+	% Check the before and after correlation:
+	corrcoef(S4,S3Cov)  %0.9999 for the 5013 subject sample
+
+	% Generate S5, the top eigenvectors for SMs, to avoid overfitting and reduce dimensionality
+	[uu,dd]=eigs(S4,N_dim);       % SVD (eigs actually)
+	S5=uu-conf*(pinv(conf)*uu);   % deconfound again just to be safe
+
+	% --- NETMAT PROCESSING ---
+	fprintf("Calculating netmat matrices N1 through N5\n")
+	% N1, formed by 1. demean, 2. globally variance normalize
+	N1=nets_demean(N0);   % 1. Demean
+	N1=N1/std(N1(:));     % 2. variance normalize
+	% N2, formed by 1. Demean, 2. remove columns that are badly conditions due to low z (z<0.1) mean value, 3. global variance normalize the matrix
+	abs_mean_NET=abs(mean(N0));                             % get mean, take abs val
+	N2=nets_demean(N0./repmat(abs_mean_NET,size(N0,1),1));  % 1. demean
+	N2(:,abs_mean_NET<0.1)=[];                              % 2. remove columns with mean value <0.1                          
+	N2=N2/std(N2(:));                                       % 3. variance normalize
+	% N3, formed by horizontally concat N1 and N2
+	N3=[N1 N2]; % Concat horizontally
+	% N4, formed by regressing the confounds matrix out of N3
+	N4=nets_demean(N3-conf*(pinv(conf)*N3));
+	% N5
+	[N5,ss1,vv1]=nets_svds(N4,N_dim); % PCA of netmat via SVD reduction
+
+	% --- Save matrices ---
+	writematrix(conf, sprintf('%s/data/%d/conf.txt', abcd_cca_dir, n_subs));
+	writematrix(N0, sprintf('%s/data/%d/N0.txt', abcd_cca_dir, n_subs));
+	writematrix(N5, sprintf('%s/data/%d/N5.txt', abcd_cca_dir, n_subs));
+	writematrix(S1, sprintf('%s/data/%d/S1.txt', abcd_cca_dir, n_subs));
+	writematrix(S5, sprintf('%s/data/%d/S5.txt', abcd_cca_dir, n_subs));
+
+end
diff --git a/abcd_cca_80-20.m b/abcd_cca_80-20.m
@@ -0,0 +1,236 @@
+%% Set up
+
+perms_per_batch_in = 2000;
+N_perm_in = 100000;
+N_dim_in = 70;
+n_subs_in = 5013;
+abcd_cca_dir    =   '/data/NIMH_scratch/abcd_cca/abcd_cca_replication/';
+
+if ~isdeployed
+    addpath(genpath(sprintf('%s/dependencies/', abcd_cca_dir)));
+    addpath(genpath(sprintf('%s/data/', abcd_cca_dir)));
+    perms_per_batch =   perms_per_batch_in;
+    N_perm          =   N_perm_in;
+    N_dim           =   N_dim_in;
+    n_subs          =   n_subs_in;
+elseif isdeployed
+    % When compiled matlab, it reads the command line args all as strings so we need to convert
+    perms_per_batch =   str2num(perms_per_batch_in);
+    N_perm          =   str2num(N_perm_in);
+    N_dim           =   str2num(N_dim_in);
+    n_subs          =   str2num(n_subs_in);
+end
+
+melodic_folder  =   sprintf('%s/data_prep/data/stage_3/%d.gica', abcd_cca_dir, n_subs);
+SUMPICS         =   sprintf('%s/melodic_IC_thin.sum', melodic_folder);
+SUMPICS_THICK   =   sprintf('%s/melodic_IC_thick.sum', melodic_folder);
+
+% read in VARS and NET and do some preprocessing
+
+% Load the Subjects X Nodes matrix (should be size Nx19900)
+NET = load(sprintf('%s/data/%d/NET.txt', abcd_cca_dir, n_subs));
+
+% VARS_0 = Subjects X SMs text file
+VARS_0 = strcsvread(sprintf('%s/data/%d/VARS.txt', abcd_cca_dir, n_subs));
+
+% Load list of SMs to be used in ICA (this list is made manually)
+ica_sms_0=fileread(sprintf('%s/data/ica_subject_measures.txt', abcd_cca_dir));
+ica_sms = strsplit(ica_sms_0);
+
+% Load list of names of colums used to encode scanner ID
+scanner_col_names_0=fileread(sprintf('%s/data/%d/scanner_confounds.txt', abcd_cca_dir, n_subs));
+scanner_col_names = strsplit(scanner_col_names_0);
+
+% Drop subject col and device serial number col (they are strings)
+egid_col    = find(strcmpi(VARS_0(1,:),'subjectid'));
+serial_col  = find(strcmpi(VARS_0(1,:),'mri_info_device.serial.number'));
+VARS_0(:,[egid_col serial_col])=[];
+
+% Get column indices of our confound variables
+[sharedvals,scanner_cols_idx]=intersect(VARS_0(1,:),scanner_col_names);
+site_col        = find(strcmpi(VARS_0(1,:),'abcd_site'));
+mri_man_col     = find(strcmpi(VARS_0(1,:),'mri_info_manufacturer'));
+mean_fd_col     = find(strcmpi(VARS_0(1,:),'mean_fd'));
+bmi_col         = find(strcmpi(VARS_0(1,:),'anthro_bmi_calc'));
+weight_col      = find(strcmpi(VARS_0(1,:),'anthro_weight_calc'));
+wholebrain_col  = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_wholebrain'));
+intracran_col   = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_intracranialvolume'));
+
+% Now get column indices of the ICA SMs
+[sharedvals,ica_sms_idx]=intersect(VARS_0(1,:),ica_sms);
+
+% Store the original order of the ICA SMs, used later to make our pos-neg axis
+sms_original_order = VARS_0(1,:);
+
+% VARS without column names
+VARS=cell2mat(VARS_0(2:end,:));
+
+% Create confounds matrix
+% NOTE, since we use the same nuisance variable matrix conf (aka Z), this is a PARTIAL CCA where the same nuisance variable matrix (Z) is used for both the SM and connectome matrices
+% (see Winkler Et al. Permutation inference in CCA, Neuroimage 2020)
+
+% Per conversation with Anderson, we might want to consider NOT doing this double normalization thing, instead just add an intercept column to our confounds matrix
+conf  = palm_inormal([ VARS(:,scanner_cols_idx) VARS(:,[mean_fd_col bmi_col weight_col]) VARS(:,[wholebrain_col intracran_col]).^(1/3) ]);  % Gaussianise
+conf(isnan(conf)|isinf(conf)) = 0;                % impute missing data as zeros
+conf  = nets_normalise([conf conf(:,length(scanner_cols_idx):end).^2]);  % add on squared terms and renormalise (all cols other than those for scanner IDs)
+conf(isnan(conf)|isinf(conf)) = 0;                % again convert NaN/inf to 0 (above line makes them reappear for some reason)
+
+%% 10 fold 80-20 validation
+for II=1:10
+    % Sort the sample into train and test groups
+    % get the family IDs
+    fam = VARS(:,37)';
+
+    % G1 will have 80% G2 will have everyone not in G1
+    G1 = [];
+    % families without replacement so same subject isn't drawn twice
+    fam_norepl = fam;
+    idx = 1:5013;
+    G1_logi = logical(zeros(1,length(fam)));
+    G2_logi = logical(zeros(1,length(fam)));
+    n_subs = length(fam);
+
+    % 80% of sample size
+    target_80 = round(n_subs*0.8);
+
+    % while less than 80% of sample size in G1
+    while (length(G1)<target_80)
+        % randomly draw a family from the without replacement list
+        rand = randsample(fam_norepl,1);
+        % find the indicies of all siblings that match family ID
+        sibs = find(fam==rand);
+        % append to G1
+        G1 = [G1 sibs];
+        % create a logical for those 
+        G1_logi(sibs) = 1;
+        % remove family from further drawings
+        fam_norepl(fam_norepl==rand) = [];
+    end
+
+    % G2 is everyone who hasn't been assigned to G1
+    G2_logi(G1_logi==0) = 1;
+    G2 = idx(G2_logi);
+
+    % preprocess the new NET and VARS from G1
+    % for svd
+	Nkeep1=100; 
+	Nkeep2=100;
+
+	% conf for group 1
+	tmpconf=conf(G1,:);
+
+	% NET for group 1, standardize and take svd
+	tmpNET=NET(G1,:);
+	NET1=nets_demean(tmpNET);
+	NET1=NET1/std(NET1(:));
+	amNET=abs(mean(NET));
+	NET3=nets_demean(tmpNET./repmat(amNET,size(tmpNET,1),1));
+	NET3(:,amNET<0.1)=[];
+	NET3=NET3/std(NET3(:));
+	grot=[NET1 NET3]; 
+	NETd=nets_demean(grot-tmpconf*(pinv(tmpconf)*grot)); 
+	[uu1G1,ss1G1,vv1G1]=nets_svds(NETd,Nkeep1); 
+
+	% VARS for group 1, standardize each column
+	varsd=palm_inormal(VARS(G1,:));
+	for i=1:size(varsd,2)
+		grot=(isnan(varsd(:,i))==0); 
+		grotconf=nets_demean(tmpconf(grot,:)); 
+		varsd(grot,i)=normalize(varsd(grot,i)-grotconf*(pinv(grotconf)*varsd(grot,i))); 
+	end
+	varsdCOV=zeros(size(varsd,1));
+	for i=1:size(varsd,1)
+		for j=1:size(varsd,1)
+			grot=varsd([i j],:); 
+			grot=cov(grot(:,sum(isnan(grot))==0)'); 
+			varsdCOV(i,j)=grot(1,2);  
+		end; 
+	end
+	varsdCOV2=nearestSPD(varsdCOV); % scatter(varsdCOV(:),varsdCOV2(:));
+	[uu,dd]=eigs(varsdCOV2,Nkeep2); 
+	uu2G1=uu-tmpconf*(pinv(tmpconf)*uu);   % deconfound again just to be safe
+	ss2G1=sqrt(dd); 
+	grot=uu2G1 * inv(ss2G1); 
+	vv2G1=zeros(size(varsd',1),size(grot,2));    %  vv2G1 = varsd' * grot; % try to get the other eigenvectors
+	for i=1:size(varsd',1)
+	    for j=1:size(grot,2)
+	      groti=isnan(varsd(:,i))==0;
+	      vv2G1(i,j) = varsd(groti,i)' * grot(groti,j) * length(groti) / sum(groti);
+	    end
+    end
+
+    % Do the CCA on the 80%
+    [grotAG1,grotBG1,grotRG1,grotUG1,grotVG1,grotstatsG1]=canoncorr(uu1G1,uu2G1); 
+
+    % conf
+	tmpconf=conf(G2,:);   % now multiply the CCA outputs into the test dataset
+
+	% NET for group 2, standardize
+	tmpNET=NET(G2,:);
+	NET1=nets_demean(tmpNET);
+	NET1=NET1/std(NET1(:));
+	amNET=abs(mean(NET));
+	NET3=nets_demean(tmpNET./repmat(amNET,size(tmpNET,1),1));
+	NET3(:,amNET<0.1)=[];
+	NET3=NET3/std(NET3(:));
+	grot=[NET1 NET3]; 
+	NETd=nets_demean(grot-tmpconf*(pinv(tmpconf)*grot));
+
+	% VARS for group 2, standardize
+	varsd=palm_inormal(VARS(G2,:));
+	for i=1:size(varsd,2)
+		grot=(isnan(varsd(:,i))==0); 
+		grotconf=nets_demean(tmpconf(grot,:)); 
+		varsd(grot,i)=normalize(varsd(grot,i)-grotconf*(pinv(grotconf)*varsd(grot,i))); 
+    end
+
+    % compare to previous CCA on 80%
+	grot_U2 = NETd  * vv1G1 * ss1G1 * grotAG1;
+	grot=vv2G1 * ss2G1 * grotBG1; 
+	grot_V2=zeros(size(varsd,1),size(grot,2));        % grot_V2 = varsd * vv2G1 * ss2G1 * grotBG1   =  varsd * grot
+	for i=1:size(varsd,1)
+		for j=1:size(grot,2)
+			groti=isnan(varsd(i,:))'==0; 
+			grot_V2(i,j) = varsd(i,groti) * grot(groti,j) * length(groti) / sum(groti); 
+		end; 
+    end
+
+    grotRRR(II) = corr(grot_U2(:,2),grot_V2(:,2))   % correlate the test-data U and V then permute to check p-values
+
+    Nperm=1000;
+
+    % read in unaltered VARS
+    VARS_forG2 = strcsvread(sprintf('%s/data/%d/VARS.txt', abcd_cca_dir, n_subs));
+
+    % subset VARS for G2, put header are write to csv
+    G2_vars = VARS_forG2(2:end,:);
+    G2_vars = G2_vars(G2,:);
+    T = cell2table(G2_vars);
+    T.Properties.VariableNames = VARS_forG2(1,:);
+    writetable(T,'G2_vars.csv')
+
+    % run abcd2blocks
+    [EB,tab] = abcd2blocks('G2_vars.csv', 'G2_blocks.csv', [100 10]);
+
+    % run palm
+    Pset=palm_quickperms([ ], EB, Nperm, true, false, true, true);
+
+    % run through the permutations
+    for j=1:Nperm
+        grotRRRnull(j)=corr(grot_U2(:,2),grot_V2(Pset(:,j),2)); 
+    end
+
+
+    grotRRRp(II)=(1+sum(grotRRRnull(2:end,1)>=grotRRR(II)))/Nperm
+	grotRRRm(II)=mean(grotRRRnull); 
+    grotRRRs(II)=std(grotRRRnull);
+end
+
+save('80-20-mode2.mat','grotRRR','grotRRRp','grotRRRm','grotRRRs') 
+
+[grotRRR' grotRRRp' grotRRRm' grotRRRs']
+mean([grotRRR' grotRRRp' grotRRRm' grotRRRs'])
+
+
+
+