diff --git a/abcd_basic_proc.m b/abcd_basic_proc.m new file mode 100755 index 0000000..90298d7 --- /dev/null +++ b/abcd_basic_proc.m @@ -0,0 +1,137 @@ +% abcd_basic_proc.m +% Written by Nikhil Goyal, National Institute of Mental Health, 2019-2020 +% Created: June 2020 +% Modified: 7/1/2020 + +% CCA processing script for the ABCD connectome and SM data +% Required inputs are the NET.txt and VARS.txt files. + +function abcd_basic_proc(N_perm, N_dim, abcd_cca_dir, n_subs) + if nargin<4 + sprintf("ERROR, not enough arguments.") + sprintf("Example: abcd_basic_proc(100000, 70, '/data/ABCD_MBDU/goyaln2/abcd_cca_replication/', 5013)") + return + end + + %% Add the dependencies folders to the PATH, and read in necessary data + if ~isdeployed + addpath(genpath(sprintf('%s/dependencies/', abcd_cca_dir))); + addpath(genpath(sprintf('%s/data/', abcd_cca_dir))); + end + + % Read in data, set some variables, create confounds matrix + % VARS_0 = Subjects X SMs text file + VARS_0=strcsvread(sprintf('%s/data/%d/VARS.txt', abcd_cca_dir, n_subs)); + + % Load the Subjects X Nodes matrix (should be size Nx19900) + N0=load(sprintf('%s/data/%d/NET.txt', abcd_cca_dir, n_subs)); + + % Load list of SMs to be used in ICA (this list is made manually) + ica_sms_0=fileread(sprintf('%s/data/ica_subject_measures.txt', abcd_cca_dir)); + ica_sms = strsplit(ica_sms_0); + + % Load list of names of colums used to encode scanner ID + scanner_col_names_0=fileread(sprintf('%s/data/%d/scanner_confounds.txt', abcd_cca_dir, n_subs)); + scanner_col_names = strsplit(scanner_col_names_0); + + % Drop subject col and device serial number col (they are strings) + egid_col = find(strcmpi(VARS_0(1,:),'subjectid')); + serial_col = find(strcmpi(VARS_0(1,:),'mri_info_device.serial.number')); + VARS_0(:,[egid_col serial_col])=[]; + + % Get column indices of our confound variables + [sharedvals,scanner_cols_idx]=intersect(VARS_0(1,:),scanner_col_names); + site_col = find(strcmpi(VARS_0(1,:),'abcd_site')); + mri_man_col = find(strcmpi(VARS_0(1,:),'mri_info_manufacturer')); + mean_fd_col = find(strcmpi(VARS_0(1,:),'mean_fd')); + bmi_col = find(strcmpi(VARS_0(1,:),'anthro_bmi_calc')); + weight_col = find(strcmpi(VARS_0(1,:),'anthro_weight_calc')); + wholebrain_col = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_wholebrain')); + intracran_col = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_intracranialvolume')); + + % Get column indices of the ICA SMs + [sharedvals,ica_sms_idx]=intersect(VARS_0(1,:),ica_sms); + + % VARS without column names + VARS=cell2mat(VARS_0(2:end,:)); + + % --- CREATE CONFOUND MATRIX --- + % Set up confounds matrix. Confounds are demeaned, any missing data is set to 0 + % Confounds are: + % 1. Individual scanner IDs (subsitute for abcd_site and mri_info_manufacturer) + % 2. mean_fd + % 3. anthro_bmi_calc + % 4. anthro_weight_calc + % 5. smri_vol_subcort.aseg_wholebrain + % 6. smri_vol_subcort.aseg_intracranialvolume + % Additional confounds from demeaning and squaring all confounds + conf = palm_inormal([ VARS(:,scanner_cols_idx) VARS(:,[mean_fd_col bmi_col weight_col]) VARS(:,[wholebrain_col intracran_col]).^(1/3) ]); % Gaussianise + conf(isnan(conf)|isinf(conf)) = 0; % impute missing data as zeros + conf = nets_normalise([conf conf(:,length(scanner_cols_idx):end).^2]); % add on squared terms and renormalise (all cols other than those for scanner IDs) + conf(isnan(conf)|isinf(conf)) = 0; % again convert NaN/inf to 0 (above line makes them reappear for some reason) + + % --- SM PROCESSING --- + % Matrix S1 (only ICA sms) + S1=[VARS(:,ica_sms_idx)]; + + % Now, prepare the final SM matrix and run the PCA + % S2, formed by gaussianizing the SMs we keep + S2=palm_inormal(S1); % Gaussianise + + % Now generate S3, formed by deconfounding the 17 confounds out of S2 + S3=S2; + for i=1:size(S3,2) % deconfound ignoring missing data + grot=(isnan(S3(:,i))==0); + grotconf=nets_demean(conf(grot,:)); + S3(grot,i)=normalize(S3(grot,i)-grotconf*(pinv(grotconf)*S3(grot,i))); + end + + % Determine how much data is missing: + sum(sum(isnan(S3)))/(size(S3,1)*size(S3,2))*100 + + % Next, we need to generate S4 + % First, estimate the SubjectsXSubjects covariance matrix (where for any two subjects, SMs missing for either subject are ignored) + % The approximate covariance matrix (varsdCOV) is then projected onto the nearest valid covariance matrix using nearestSPD toolbox. + % This method avoids imputation, and S4 can be fed into PCA. + S3Cov=zeros(size(S3,1)); + for i=1:size(S3,1) % estimate "pairwise" covariance, ignoring missing data + for j=1:size(S3,1) + grot=S3([i j],:); + grot=cov(grot(:,sum(isnan(grot))==0)'); + S3Cov(i,j)=grot(1,2); + end + end + S4=nearestSPD(S3Cov); % project onto the nearest valid covariance matrix. This method avoids imputation (we can't have any missing values before running the PCA) + + % Check the before and after correlation: + corrcoef(S4,S3Cov) %0.9999 for the 5013 subject sample + + % Generate S5, the top eigenvectors for SMs, to avoid overfitting and reduce dimensionality + [uu,dd]=eigs(S4,N_dim); % SVD (eigs actually) + S5=uu-conf*(pinv(conf)*uu); % deconfound again just to be safe + + % --- NETMAT PROCESSING --- + fprintf("Calculating netmat matrices N1 through N5\n") + % N1, formed by 1. demean, 2. globally variance normalize + N1=nets_demean(N0); % 1. Demean + N1=N1/std(N1(:)); % 2. variance normalize + % N2, formed by 1. Demean, 2. remove columns that are badly conditions due to low z (z<0.1) mean value, 3. global variance normalize the matrix + abs_mean_NET=abs(mean(N0)); % get mean, take abs val + N2=nets_demean(N0./repmat(abs_mean_NET,size(N0,1),1)); % 1. demean + N2(:,abs_mean_NET<0.1)=[]; % 2. remove columns with mean value <0.1 + N2=N2/std(N2(:)); % 3. variance normalize + % N3, formed by horizontally concat N1 and N2 + N3=[N1 N2]; % Concat horizontally + % N4, formed by regressing the confounds matrix out of N3 + N4=nets_demean(N3-conf*(pinv(conf)*N3)); + % N5 + [N5,ss1,vv1]=nets_svds(N4,N_dim); % PCA of netmat via SVD reduction + + % --- Save matrices --- + writematrix(conf, sprintf('%s/data/%d/conf.txt', abcd_cca_dir, n_subs)); + writematrix(N0, sprintf('%s/data/%d/N0.txt', abcd_cca_dir, n_subs)); + writematrix(N5, sprintf('%s/data/%d/N5.txt', abcd_cca_dir, n_subs)); + writematrix(S1, sprintf('%s/data/%d/S1.txt', abcd_cca_dir, n_subs)); + writematrix(S5, sprintf('%s/data/%d/S5.txt', abcd_cca_dir, n_subs)); + +end \ No newline at end of file diff --git a/abcd_cca_80-20.m b/abcd_cca_80-20.m new file mode 100755 index 0000000..628cc41 --- /dev/null +++ b/abcd_cca_80-20.m @@ -0,0 +1,236 @@ +%% Set up + +perms_per_batch_in = 2000; +N_perm_in = 100000; +N_dim_in = 70; +n_subs_in = 5013; +abcd_cca_dir = '/data/NIMH_scratch/abcd_cca/abcd_cca_replication/'; + +if ~isdeployed + addpath(genpath(sprintf('%s/dependencies/', abcd_cca_dir))); + addpath(genpath(sprintf('%s/data/', abcd_cca_dir))); + perms_per_batch = perms_per_batch_in; + N_perm = N_perm_in; + N_dim = N_dim_in; + n_subs = n_subs_in; +elseif isdeployed + % When compiled matlab, it reads the command line args all as strings so we need to convert + perms_per_batch = str2num(perms_per_batch_in); + N_perm = str2num(N_perm_in); + N_dim = str2num(N_dim_in); + n_subs = str2num(n_subs_in); +end + +melodic_folder = sprintf('%s/data_prep/data/stage_3/%d.gica', abcd_cca_dir, n_subs); +SUMPICS = sprintf('%s/melodic_IC_thin.sum', melodic_folder); +SUMPICS_THICK = sprintf('%s/melodic_IC_thick.sum', melodic_folder); + +% read in VARS and NET and do some preprocessing + +% Load the Subjects X Nodes matrix (should be size Nx19900) +NET = load(sprintf('%s/data/%d/NET.txt', abcd_cca_dir, n_subs)); + +% VARS_0 = Subjects X SMs text file +VARS_0 = strcsvread(sprintf('%s/data/%d/VARS.txt', abcd_cca_dir, n_subs)); + +% Load list of SMs to be used in ICA (this list is made manually) +ica_sms_0=fileread(sprintf('%s/data/ica_subject_measures.txt', abcd_cca_dir)); +ica_sms = strsplit(ica_sms_0); + +% Load list of names of colums used to encode scanner ID +scanner_col_names_0=fileread(sprintf('%s/data/%d/scanner_confounds.txt', abcd_cca_dir, n_subs)); +scanner_col_names = strsplit(scanner_col_names_0); + +% Drop subject col and device serial number col (they are strings) +egid_col = find(strcmpi(VARS_0(1,:),'subjectid')); +serial_col = find(strcmpi(VARS_0(1,:),'mri_info_device.serial.number')); +VARS_0(:,[egid_col serial_col])=[]; + +% Get column indices of our confound variables +[sharedvals,scanner_cols_idx]=intersect(VARS_0(1,:),scanner_col_names); +site_col = find(strcmpi(VARS_0(1,:),'abcd_site')); +mri_man_col = find(strcmpi(VARS_0(1,:),'mri_info_manufacturer')); +mean_fd_col = find(strcmpi(VARS_0(1,:),'mean_fd')); +bmi_col = find(strcmpi(VARS_0(1,:),'anthro_bmi_calc')); +weight_col = find(strcmpi(VARS_0(1,:),'anthro_weight_calc')); +wholebrain_col = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_wholebrain')); +intracran_col = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_intracranialvolume')); + +% Now get column indices of the ICA SMs +[sharedvals,ica_sms_idx]=intersect(VARS_0(1,:),ica_sms); + +% Store the original order of the ICA SMs, used later to make our pos-neg axis +sms_original_order = VARS_0(1,:); + +% VARS without column names +VARS=cell2mat(VARS_0(2:end,:)); + +% Create confounds matrix +% NOTE, since we use the same nuisance variable matrix conf (aka Z), this is a PARTIAL CCA where the same nuisance variable matrix (Z) is used for both the SM and connectome matrices +% (see Winkler Et al. Permutation inference in CCA, Neuroimage 2020) + +% Per conversation with Anderson, we might want to consider NOT doing this double normalization thing, instead just add an intercept column to our confounds matrix +conf = palm_inormal([ VARS(:,scanner_cols_idx) VARS(:,[mean_fd_col bmi_col weight_col]) VARS(:,[wholebrain_col intracran_col]).^(1/3) ]); % Gaussianise +conf(isnan(conf)|isinf(conf)) = 0; % impute missing data as zeros +conf = nets_normalise([conf conf(:,length(scanner_cols_idx):end).^2]); % add on squared terms and renormalise (all cols other than those for scanner IDs) +conf(isnan(conf)|isinf(conf)) = 0; % again convert NaN/inf to 0 (above line makes them reappear for some reason) + +%% 10 fold 80-20 validation +for II=1:10 + % Sort the sample into train and test groups + % get the family IDs + fam = VARS(:,37)'; + + % G1 will have 80% G2 will have everyone not in G1 + G1 = []; + % families without replacement so same subject isn't drawn twice + fam_norepl = fam; + idx = 1:5013; + G1_logi = logical(zeros(1,length(fam))); + G2_logi = logical(zeros(1,length(fam))); + n_subs = length(fam); + + % 80% of sample size + target_80 = round(n_subs*0.8); + + % while less than 80% of sample size in G1 + while (length(G1)=grotRRR(II)))/Nperm + grotRRRm(II)=mean(grotRRRnull); + grotRRRs(II)=std(grotRRRnull); +end + +save('80-20-mode2.mat','grotRRR','grotRRRp','grotRRRm','grotRRRs') + +[grotRRR' grotRRRp' grotRRRm' grotRRRs'] +mean([grotRRR' grotRRRp' grotRRRm' grotRRRs']) + + + + diff --git a/abcd_cca_batch.m b/abcd_cca_batch.m new file mode 100755 index 0000000..fc72809 --- /dev/null +++ b/abcd_cca_batch.m @@ -0,0 +1,149 @@ +% abcd_cca_batch.m +% Written by Nikhil Goyal, National Institute of Mental Health, 2019-2020 +% Created: 7/9/20 +% Modified: 7/10/20 (modified to do some calcuations on the set of perms (percentiles, means)) + +% Script is used in batch processing to calculate CCA for each of the 100,000 permutations we generate +% This script is given a start index and the number of permutations to calculate +% NOTE: this script is COMPILED, and it expects input args to be strings (from cmd line), so we type cast the args + +% How to compile this script on NIH biowulf using the mcc2 utility: +% mcc2 -m abcd_cca_batch.m -d compiled_scripts/ -nojvm -a ./dependencies/palm-alpha116/palm_inormal.m + +% And to run a SINGLE INSTANCE of it (this is NOT how it is called on slurm): +%{ +export XAPPLRESDIR=MR/v98/X11/app-defaults +export LD_LIBRARY_PATH=MR/v98/runtime/glnxa64:MR/v98/bin/glnxa64:MR/v98/sys/os/glnxa64:MR/v98/sys/opengl/lib/glnxa64 +./run_abcd_cca_batch.sh /usr/local/matlab-compiler/v98 1 1000 70 /data/ABCD_MBDU/goyaln2/abcd_cca_replication/ 500 +%} + +% To see the command structure for HPC slurm for this script, see gen_batch_permutation_ica_swarm.py + +% The output of this script is a .mat file that contains data necessary for null distribution permutation testing +% For the sake of aggregating this data, some computations are done here (finding percentiles and means) on the num_perms calculated (reduces memory and computational overhead later in analysis when we need to use this data) + +function abcd_cca_batch(start_idx_in, num_perms_in, N_dim_in, abcd_cca_dir, n_subs_in) + if nargin<5 + sprintf("ERROR, not enough arguments.") + sprintf("Example: abcd_cca_batch(1, 1000, 70, '/data/ABCD_MBDU/goyaln2/abcd_cca_replication/', 1000)") + return + end + + if ~isdeployed + addpath(genpath(sprintf('%s/dependencies/', abcd_cca_dir))); + addpath(genpath(sprintf('%s/data/', abcd_cca_dir))); + start_idx = start_idx_in; + num_perms = num_perms_in; + N_dim = N_dim_in; + n_subs = n_subs_in; + elseif isdeployed + % When compiled matlab, it reads the command line args all as strings so we need to convert + start_idx = str2num(start_idx_in); + num_perms = str2num(num_perms_in); + N_dim = str2num(N_dim_in); + n_subs = str2num(n_subs_in); + end + + % Load data + % Matrix S1 (only ICA sms) + s1 = sprintf('%s/data/%d/S1.txt', abcd_cca_dir, n_subs); + % Matrix S5 (post-PCA SM matrix) + s5 = sprintf('%s/data/%d/S5.txt', abcd_cca_dir, n_subs); + % Matrix N0 (raw connectome data) + n0 = sprintf('%s/data/%d/N0.txt', abcd_cca_dir, n_subs); + % Matrix N5 (post-PCA connectome matrix) + n5 = sprintf('%s/data/%d/N5.txt', abcd_cca_dir, n_subs); + + S1 = load(s1); + S5 = load(s5); + N0 = load(n0); + N5 = load(n5); + + % Permutation matrix + pset=sprintf('%s/data/%d/Pset.txt', abcd_cca_dir, n_subs); + Pset=load(pset); + + % Temporary version of VARS used for permutation calcs + grotvars=palm_inormal(S1); + grotvars(:,std(grotvars)<1e-10)=[]; + grotvars(:,sum(isnan(grotvars)==0)<20)=[]; + + + % Varibles we calculate for each permutation: + % r = the vector r for the permutation + % nullNETr = the permutation's mode 1 connectome weights correlation to the RAW connectome matrix + % nullNETv = for this permutation, the summation of the correlation values between each mode's connectome weights and the RAW connectome matrix + % nullSMr = for this permutation, CCA mode 1 SM weights correalated against the S1 matrix + % nullSMv = for this permutation, the summation of the correlation values between each mode's SM weights and the S1 matrix + + % Aggregation variables + r_agg = []; + nullNETr_agg = []; + nullNETv_agg = []; + nullSMr_agg = []; + nullSMv_agg = []; + + r=zeros(N_dim+1, 1); + for perm = start_idx:(start_idx+num_perms-1) + % Note, need to range from start_idx:(start_idx+num_perms-1) because without the -1 on the final iteration perm will exceed 100,000 becoming 100,001 and causing an error + % Also, need the -1 for only num_perms permutations to be ran + [A, B, r(1:end-1), U, V, stats] = canoncorr(N5,S5(Pset(:,perm),:)); + r(end)=mean(r(1:end-1)); + + nullNETr = corr(U(:,1),N0)'; + nullNETv = sum(corr(U,N0).^2,2); + + nullSMr = corr(V(:,1),grotvars(Pset(:,perm),:),'rows','pairwise')'; + nullSMv = sum(corr(V,grotvars(Pset(:,perm),:),'rows','pairwise').^2,2); + + r_agg = [r_agg; r' ]; + nullNETr_agg = [nullNETr_agg; nullNETr']; + nullNETv_agg = [nullNETv_agg; nullNETv']; + nullSMr_agg = [nullSMr_agg; nullSMr']; + nullSMv_agg = [nullSMv_agg; nullSMv']; + + end + + % Now find the summary variables for these 1000 permutatations + % These are what we need to compute (taken from Steve Smith's code): + % NOTE - these are percentiles of the columns - the result should be a 1xN_dim vector (where the N_dim columns are CCA modes) + % prctile(nullNETv,5,1) mean(nullNETv,1) prctile(nullNETv,95,1) + % prctile(nullSMv,5,1) mean(nullSMv,1) prctile(nullSMv,95,1) + % prctile( max(abs(nullSMr)) ,95) + % prctile( max(abs(nullNETr)) ,95) + + s = struct( 'start_idx', {}, ... + 'num_perms', {}, ... + 'nullNETv_prctile_95', {}, ... + 'nullNETv_prctile_5', {}, ... + 'nullNETv_mean', {}, ... + 'nullSMv_prctile_95', {}, ... + 'nullSMv_prctile_5', {}, ... + 'nullSMv_mean', {}, ... + 'nullNETr_prctile_95', {}, ... + 'nullSMr_prctile_95', {}, ... + 'r', {} ); + + s(1).start_idx = start_idx; + s(1).num_perms = num_perms; + + s(1).nullNETv_prctile_95 = prctile( nullNETv_agg, 95,1); + s(1).nullNETv_prctile_5 = prctile( nullNETv_agg, 5,1); + s(1).nullNETv_mean = mean( nullNETv_agg, 1); + s(1).nullNETv_std = std(nullNETv_agg, 1); + + s(1).nullSMv_prctile_95 = prctile( nullSMv_agg, 95, 1); + s(1).nullSMv_prctile_5 = prctile( nullSMv_agg, 5, 1); + s(1).nullSMv_mean = mean( nullSMv_agg, 1); + s(1).nullSMv_std = std(nullSMv_agg, 1); + + s(1).nullNETr_prctile_95 = prctile( max(abs(nullNETr_agg)), 95); + s(1).nullSMr_prctile_95 = prctile( max(abs(nullSMr_agg)), 95); + s(1).r = r_agg; + s(1).nullNETr_std = std(nullNETr_agg, 1); + s(1).nullSMr_std = std(nullSMr_agg, 1); + + % Save .mat file with the permutations + save(sprintf('%s/data/%d/permutations/permutations_%d.mat', abcd_cca_dir, n_subs, start_idx), 's'); + +end \ No newline at end of file diff --git a/abcd_cca_interactive.m b/abcd_cca_interactive.m new file mode 100755 index 0000000..3e94906 --- /dev/null +++ b/abcd_cca_interactive.m @@ -0,0 +1,261 @@ +% NOTE: need the following loaded on path +% fsl, connectome-workbench + +perms_per_batch_in = 2000 +N_perm_in = 100000 +N_dim_in = 70 +n_subs_in = 5013 +abcd_cca_dir = '/data/NIMH_scratch/abcd_cca/abcd_cca_replication/'; + +if ~isdeployed + addpath(genpath(sprintf('%s/dependencies/', abcd_cca_dir))); + addpath(genpath(sprintf('%s/data/', abcd_cca_dir))); + perms_per_batch = perms_per_batch_in; + N_perm = N_perm_in; + N_dim = N_dim_in; + n_subs = n_subs_in; +elseif isdeployed + % When compiled matlab, it reads the command line args all as strings so we need to convert + perms_per_batch = str2num(perms_per_batch_in); + N_perm = str2num(N_perm_in); + N_dim = str2num(N_dim_in); + n_subs = str2num(n_subs_in); +end + +melodic_folder = sprintf('%s/data_prep/data/stage_3/%d.gica', abcd_cca_dir, n_subs); +SUMPICS = sprintf('%s/melodic_IC_thin.sum', melodic_folder); +SUMPICS_THICK = sprintf('%s/melodic_IC_thick.sum', melodic_folder); + +%% --- Read in data, set some variables, create confounds matrix --- +% load in data from FSLNets calculations +fslnets_mat = load(sprintf('%s/data_prep/data/stage_4/%d/fslnets.mat', abcd_cca_dir, n_subs)); + +% Load the Subjects X Nodes matrix (should be size Nx19900) +N0 = load(sprintf('%s/data/%d/NET.txt', abcd_cca_dir, n_subs)); + +% VARS_0 = Subjects X SMs text file +VARS_0 = strcsvread(sprintf('%s/data/%d/VARS.txt', abcd_cca_dir, n_subs)); + +% Load list of SMs to be used in ICA (this list is made manually) +ica_sms_0=fileread(sprintf('%s/data/ica_subject_measures.txt', abcd_cca_dir)); +ica_sms = strsplit(ica_sms_0); + +% Load list of names of colums used to encode scanner ID +scanner_col_names_0=fileread(sprintf('%s/data/%d/scanner_confounds.txt', abcd_cca_dir, n_subs)); +scanner_col_names = strsplit(scanner_col_names_0); + +% Drop subject col and device serial number col (they are strings) +egid_col = find(strcmpi(VARS_0(1,:),'subjectid')); +serial_col = find(strcmpi(VARS_0(1,:),'mri_info_device.serial.number')); +VARS_0(:,[egid_col serial_col])=[]; + +% Get column indices of our confound variables +[sharedvals,scanner_cols_idx]=intersect(VARS_0(1,:),scanner_col_names); +site_col = find(strcmpi(VARS_0(1,:),'abcd_site')); +mri_man_col = find(strcmpi(VARS_0(1,:),'mri_info_manufacturer')); +mean_fd_col = find(strcmpi(VARS_0(1,:),'mean_fd')); +bmi_col = find(strcmpi(VARS_0(1,:),'anthro_bmi_calc')); +weight_col = find(strcmpi(VARS_0(1,:),'anthro_weight_calc')); +wholebrain_col = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_wholebrain')); +intracran_col = find(strcmpi(VARS_0(1,:),'smri_vol_subcort.aseg_intracranialvolume')); + +% Now get column indices of the ICA SMs +[sharedvals,ica_sms_idx]=intersect(VARS_0(1,:),ica_sms); + +% Store the original order of the ICA SMs, used later to make our pos-neg axis +sms_original_order = VARS_0(1,:); + +% VARS without column names +VARS=cell2mat(VARS_0(2:end,:)); + +% Create confounds matrix +% NOTE, since we use the same nuisance variable matrix conf (aka Z), this is a PARTIAL CCA where the same nuisance variable matrix (Z) is used for both the SM and connectome matrices +% (see Winkler Et al. Permutation inference in CCA, Neuroimage 2020) + +% Per conversation with Anderson, we might want to consider NOT doing this double normalization thing, instead just add an intercept column to our confounds matrix +conf = palm_inormal([ VARS(:,scanner_cols_idx) VARS(:,[mean_fd_col bmi_col weight_col]) VARS(:,[wholebrain_col intracran_col]).^(1/3) ]); % Gaussianise +conf(isnan(conf)|isinf(conf)) = 0; % impute missing data as zeros +conf = nets_normalise([conf conf(:,length(scanner_cols_idx):end).^2]); % add on squared terms and renormalise (all cols other than those for scanner IDs) +conf(isnan(conf)|isinf(conf)) = 0; % again convert NaN/inf to 0 (above line makes them reappear for some reason) + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% --- SM PROCESSING --- +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% Matrix S1 (only ICA sms) +S1=[VARS(:,ica_sms_idx)]; + +% Now, prepare the final SM matrix and run the PCA +% S2, formed by gaussianizing the SMs we keep +S2=palm_inormal(S1); % Gaussianise + + % Now generate S3, formed by deconfounding the 17 confounds out of S2 +S3=S2; +for i=1:size(S3,2) % deconfound ignoring missing data + grot=(isnan(S3(:,i))==0); + grotconf=nets_demean(conf(grot,:)); + S3(grot,i)=normalize(S3(grot,i)-grotconf*(pinv(grotconf)*S3(grot,i))); +end + +% Determine how much data is missing: +sum(sum(isnan(S3)))/(size(S3,1)*size(S3,2))*100 + +% Next, we need to generate S4 +% First, estimate the SubjectsXSubjects covariance matrix (where for any two subjects, SMs missing for either subject are ignored) +% The approximate covariance matrix (varsdCOV) is then projected onto the nearest valid covariance matrix using nearestSPD toolbox. +% This method avoids imputation, and S4 can be fed into PCA. +S3Cov=zeros(size(S3,1)); +for i=1:size(S3,1) % estimate "pairwise" covariance, ignoring missing data + for j=1:size(S3,1) + grot=S3([i j],:); + grot=cov(grot(:,sum(isnan(grot))==0)'); + S3Cov(i,j)=grot(1,2); + end +end +S4=nearestSPD(S3Cov); % project onto the nearest valid covariance matrix. This method avoids imputation (we can't have any missing values before running the PCA) + +% Check the before and after correlation: +corrcoef(S4,S3Cov) %0.9999 for the 5013 subject sample + +% Generate S5, the top eigenvectors for SMs, to avoid overfitting and reduce dimensionality +[uu,dd]=eigs(S4,N_dim); % SVD (eigs actually) +S5=uu-conf*(pinv(conf)*uu); % deconfound again just to be safe + + +% --- NETMAT PROCESSING --- +fprintf("Calculating netmat matrices N1 through N5\n") +% N1, formed by 1. demean, 2. globally variance normalize +N1=nets_demean(N0); % 1. Demean +N1=N1/std(N1(:)); % 2. variance normalize +% N2, formed by 1. Demean, 2. remove columns that are badly conditions due to low z (z<0.1) mean value, 3. global variance normalize the matrix +abs_mean_NET=abs(mean(N0)); % get mean, take abs val +N2=nets_demean(N0./repmat(abs_mean_NET,size(N0,1),1)); % 1. demean +N2(:,abs_mean_NET<0.1)=[]; % 2. remove columns with mean value <0.1 +N2=N2/std(N2(:)); % 3. variance normalize +% N3, formed by horizontally concat N1 and N2 +N3=[N1 N2]; % Concat horizontally +% N4, formed by regressing the confounds matrix out of N3 +N4=nets_demean(N3-conf*(pinv(conf)*N3)); +% N5 +[N5,ss1,vv1]=nets_svds(N4,N_dim); % PCA of netmat via SVD reduction + +%% --- CCA with Qz--- +fprintf("Running CCA on matrices S5 and N5\n") +[A, B, R, U, V, initial_stats] = canoncorr(N5,S5); + +% Plot +figure; +scatter(U(:,1), V(:,1)) +% Plot with regression line +figure; +plotregression(U(:,1), V(:,1)) + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% --- POSITIVE NEGATIVE AXIS +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +% post hoc columns not included in CCA +bmi_col = find(strcmpi(VARS_0(1,:),'anthro_bmi_calc')); +educ_col = find(strcmpi(VARS_0(1,:),'high.educ')); +age_col = find(strcmpi(VARS_0(1,:),'age')); +income_col = find(strcmpi(VARS_0(1,:),'household.income.bl')); + +% SM indices +pneg_idx = [ica_sms_idx' bmi_col educ_col age_col income_col]; +% include / exclude vector +inc_exc = [ones(1,length(ica_sms_idx)) zeros(1,4)] +% SM names +pneg_names = VARS_0(1,pneg_idx) +% SM measures +pneg_vars = [VARS(:,ica_sms_idx) VARS(:,bmi_col) VARS(:,educ_col) VARS(:,age_col) VARS(:,income_col)]; + +% specify CCA mode +I = 2; +% run the correlations, the sign is arbitrary, so lets flip it to be +% consistent with Smith +CorCCA = -1*corr(V(:,I), palm_inormal(pneg_vars), 'rows','pairwise'); + +% open the figure +figure; +plot(CorCCA); % correlate CCA component 1 with original set of SMs +% Z scores +ZCCA = 12*0.5*log((1+CorCCA)./(1-CorCCA)); % r2z - factor x12 gets close to "real" zstats. +[tmp_B,tmp_I] = sort(CorCCA,'ascend'); % sort by corralation value +toplist = []; +strlist = {}; + +% Iterate over the SMs +for i = 1:length(tmp_I) + % Since we sort the values, we use ii to index into the unsorted list and get the proper index of the column from VARS + ii = tmp_I(i); + % Pull the values for the column associated with the SM being looked at + Y = pneg_vars(:,ii); + Y_no_nan_idx = ~isnan(Y); + Y = nets_demean(Y(Y_no_nan_idx)); + X = nets_demean(V(Y_no_nan_idx,I)); + VarExplained(ii) = var(X*(pinv(X)*Y)) / var(Y); + + if abs(tmp_B(i))>0.2 + BVname = string(pneg_names(ii)); + toplist=[toplist ii]; + str=sprintf('%.2f %.2f %.2f %s %s',CorCCA(ii),ZCCA(ii),VarExplained(ii),BVname); + strlist{end+1} = str; + end +end + +x = ones([length(toplist),1]); +dx = 0.1; +y = 1:length(toplist); +dy = 0.1; +set(0,'DefaultTextInterpreter','none') +scatter(x,y); +text(x+dx, y+dy, strlist); + + + +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%% --- VARIANCE/PERMUTATION ANALYSIS --- +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +perm_data = load(sprintf('%s/data/%d/permutation_data.mat', abcd_cca_dir, n_subs)); +tmp_VARS = palm_inormal(S1); +tmp_VARS(:,std(tmp_VARS)<1e-10) = []; +tmp_VARS(:,sum(isnan(tmp_VARS)==0)<20) = []; + +for i = 1:N_dim; % show corrected pvalues + Rpval(i) = (1 + sum(perm_data.s_agg(1).r(2:end,1) >= R(i))) / N_perm; +end +Rpval; +Ncca=sum(Rpval<0.05) % number of significant CCA components +Rpval(1:Ncca) + +variance_data_NET = [ sum(corr(U, N0).^2,2)'; perm_data.s_agg(1).nullNETv_prctile_5; perm_data.s_agg(1).nullNETv_mean; perm_data.s_agg(1).nullNETv_prctile_95; sum(corr(N5,N0).^2,2)' ] * 100 / size(N0,2); +variance_data_VARS = [ sum(corr(V,tmp_VARS,'rows','pairwise').^2,2)'; perm_data.s_agg(1).nullSMv_prctile_5; perm_data.s_agg(1).nullSMv_mean; perm_data.s_agg(1).nullSMv_prctile_95; sum(corr(S5,tmp_VARS,'rows','pairwise').^2,2)' ] * 100 / size(tmp_VARS,2); + +z_scores_NET = [ (sum(corr(U,N0).^2,2)- perm_data.s_agg(1).nullNETv_mean') ./ perm_data.s_agg(1).nullNETv_std' ] ; +z_scores_SM = [ (sum(corr(V,tmp_VARS,'rows','pairwise').^2,2)- perm_data.s_agg(1).nullSMv_mean') ./ perm_data.s_agg(1).nullSMv_std' ]; + +% Look at first 20 modes +I=1:20; + +% Connectomes variance +figure; +subplot(2,1,1); +hold on; +% Draw the rectangles for null distributions per mode +for i=1:length(I) + rectangle('Position',[i-0.5 variance_data_NET(2,i) 1 variance_data_NET(4,i)-variance_data_NET(2,i)],'FaceColor',[0.8 0.8 0.8],'EdgeColor',[0.8 0.8 0.8]); +end +plot(variance_data_NET(3,I),'k'); +plot(variance_data_NET(1,I),'b'); +plot(variance_data_NET(1,I),'b.'); + + +% Subject measures variance +subplot(2,1,2); +hold on; +for i=1:length(I) + rectangle('Position',[i-0.5 variance_data_VARS(2,i) 1 variance_data_VARS(4,i)-variance_data_VARS(2,i)],'FaceColor',[0.8 0.8 0.8],'EdgeColor',[0.8 0.8 0.8]); +end +plot(variance_data_VARS(3,I),'k'); +plot(variance_data_VARS(1,I),'b'); +plot(variance_data_VARS(1,I),'b.'); diff --git a/abcd_perm_agg.m b/abcd_perm_agg.m new file mode 100755 index 0000000..52e9e78 --- /dev/null +++ b/abcd_perm_agg.m @@ -0,0 +1,113 @@ +% abcd_perm_agg.m +% Written by Nikhil Goyal, National Institute of Mental Health, 2019-2020 +% Created: 7/1/2020 +% Modified: 7/10/20 (modified to accomodate the outputs of the updated abcd_cca_batch.m which reduces the amount of data in each .mat file) + +% Script is used in batch processing to calculate CCA for each of the 100,000 permutations we generate + +function abcd_perm_agg(perms_per_batch_in, N_perm_in, N_dim_in, abcd_cca_dir, n_subs_in) + if nargin<5 + sprintf("ERROR, not enough arguments.") + sprintf("Example: abcd_perm_agg(2000, 100000, 70, '/data/ABCD_MBDU/goyaln2/abcd_cca_replication/', 5013)") + return + end + + if ~isdeployed + addpath(genpath(sprintf('%s/dependencies/', abcd_cca_dir))); + addpath(genpath(sprintf('%s/data/', abcd_cca_dir))); + perms_per_batch = perms_per_batch_in; + N_perm = N_perm_in; + N_dim = N_dim_in; + n_subs = n_subs_in; + elseif isdeployed + % When compiled matlab, it reads the command line args all as strings so we need to convert + perms_per_batch = str2num(perms_per_batch_in); + N_perm = str2num(N_perm_in); + N_dim = str2num(N_dim_in); + n_subs = str2num(n_subs_in); + end + + r_agg = []; + + nullNETv_prctile_95_agg = []; + nullNETv_prctile_5_agg = []; + nullNETv_mean_agg = []; + nullNETv_std_agg = []; + + nullSMv_prctile_95_agg = []; + nullSMv_prctile_5_agg = []; + nullSMv_mean_agg = []; + nullSMv_std_agg = []; + + + nullNETr_prctile_95_agg = []; + nullSMr_prctile_95_agg = []; + nullNETr_std_agg = []; + nullSMr_std_agg = []; + + for perm = 1:perms_per_batch:N_perm + % Iterate over 1 = 1, 1001, 2001, .... 99001 + % filename of .mat files we need to load are of form permutations_<#>.mat + + f_name = sprintf('%s/data/%d/permutations/permutations_%d.mat',abcd_cca_dir, n_subs, perm); + + % Load the .mat file + mat_file = load(f_name); + + r_agg = [r_agg; mat_file.s(1).r ]; + + nullNETv_prctile_95_agg = [nullNETv_prctile_95_agg; mat_file.s(1).nullNETv_prctile_95 ]; + nullNETv_prctile_5_agg = [nullNETv_prctile_5_agg; mat_file.s(1).nullNETv_prctile_5 ]; + nullNETv_mean_agg = [nullNETv_mean_agg; mat_file.s(1).nullNETv_mean ]; + nullNETv_std_agg = [nullNETv_std_agg; mat_file.s(1).nullNETv_std]; + + nullSMv_prctile_95_agg = [nullSMv_prctile_95_agg; mat_file.s(1).nullSMv_prctile_95 ]; + nullSMv_prctile_5_agg = [nullSMv_prctile_5_agg; mat_file.s(1).nullSMv_prctile_5 ]; + nullSMv_mean_agg = [nullSMv_mean_agg; mat_file.s(1).nullSMv_mean ]; + nullSMv_std_agg = [nullSMv_std_agg; mat_file.s(1).nullSMv_std]; + + nullNETr_prctile_95_agg = [nullNETr_prctile_95_agg; mat_file.s(1).nullNETr_prctile_95 ]; + nullSMr_prctile_95_agg = [nullSMr_prctile_95_agg; mat_file.s(1).nullSMr_prctile_95 ]; + nullNETr_std_agg = [nullNETr_std_agg; mat_file.s(1).nullNETr_std]; + nullSMr_std_agg = [nullSMr_std_agg; mat_file.s(1).nullSMr_std]; + + end + + s_agg = struct( 'perms_per_batch', {}, ... + 'tot_perms', {}, ... + 'nullNETv_prctile_95', {}, ... + 'nullNETv_prctile_5', {}, ... + 'nullNETv_mean', {}, ... + 'nullSMv_prctile_95', {}, ... + 'nullSMv_prctile_5', {}, ... + 'nullSMv_mean', {}, ... + 'nullNETr_prctile_95', {}, ... + 'nullSMr_prctile_95', {}, ... + 'r', {} ); + + s_agg(1).perms_per_batch = perms_per_batch; + s_agg(1).tot_perms = N_perm; + + s_agg(1).nullNETv_prctile_95 = mean( nullNETv_prctile_95_agg,1); + s_agg(1).nullNETv_prctile_5 = mean( nullNETv_prctile_5_agg,1); + + s_agg(1).nullNETv_mean = mean( nullNETv_mean_agg, 1); + s_agg(1).nullNETv_std = mean(nullNETv_std_agg, 1); + + s_agg(1).nullSMv_prctile_95 = mean( nullSMv_prctile_95_agg, 1); + s_agg(1).nullSMv_prctile_5 = mean( nullSMv_prctile_5_agg, 1); + + s_agg(1).nullSMv_mean = mean( nullSMv_mean_agg, 1); + s_agg(1).nullSMv_std = mean(nullSMv_std_agg, 1); + + s_agg(1).nullNETr_prctile_95 = prctile( max(abs(nullNETr_prctile_95_agg)), 95); + s_agg(1).nullSMr_prctile_95 = prctile( max(abs(nullSMr_prctile_95_agg)), 95); + s_agg(1).r = r_agg; + s_agg(1).nullNETr_std = mean(nullNETr_std_agg, 1); + s_agg(1).nullSMr_std = mean(nullSMr_std_agg, 1); + + + % Now save + save(sprintf('%s/data/%d/permutation_data.mat', abcd_cca_dir, n_subs), 's_agg'); + +end \ No newline at end of file diff --git a/abcd_perm_gen.m b/abcd_perm_gen.m new file mode 100755 index 0000000..1410940 --- /dev/null +++ b/abcd_perm_gen.m @@ -0,0 +1,44 @@ +% abcd_perm_gen.m +% Written by Nikhil Goyal, National Institute of Mental Health, 2019-2020 +% Created: 7/1/2020 +% Modified: 7/6/20 (made the script into a function) +% 7/7/20 (updated the palm_quickperms call to make it faster) + +% n_subs = name of subset folder (e.x. 1000) where the data is/should be saved; value is equal to the number of subjects under analysis + +% Example call: +% abcd_perm_gen(100000, "/data/ABCD_MBDU/goyaln2/abcd_cca_replication/", 1000) + + +function abcd_perm_gen(N_perm, abcd_cca_dir, n_subs) + if nargin<3 + sprintf("ERROR, not enough arguments.") + sprintf("Example: abcd_perm_gen(100000, '/data/ABCD_MBDU/goyaln2/abcd_cca_replication/', 1000)") + return + end + + if ~isdeployed + addpath(genpath(sprintf('%s/dependencies/', abcd_cca_dir))); + addpath(genpath(sprintf('%s/data/', abcd_cca_dir))); + end + + % Generate permutations using the hcp2blocks package + in_VARS = sprintf('%s/data/%d/VARS.txt', abcd_cca_dir, n_subs); + blocksfile = sprintf('%s/data/%d/blocksfile.csv', abcd_cca_dir, n_subs); + [EB,tab] = abcd2blocks(in_VARS, blocksfile, [100 10]); + + % NOTE, call palm_quickperms with the following options (for fastest calculations while also finding permutations proper) + % [Pset,VG] = palm_quickperms(M,EB,P,EE,ISE,CMCx,CMCp) + % M = [] (no design matrix + % EB = exchangability blocks from abcd2blocks calc + % N_perm = number of permutations + % EE = true (so we do permutations proper) + % ISE = false (so we don't do sign flips) + % CMCx = true (this only applies for design matix, so we set to true) + % CMCp = true (allow duplicate permutations, for speed and since we have nearly infinite possible permutations) + Pset=palm_quickperms([ ], EB, N_perm, true, false, true, true); + % Note, Pset is the final matrix of permuations (one permutation per column) + + % Now save Pset to file + writematrix(Pset, sprintf('%s/data/%d/Pset.txt', abcd_cca_dir, n_subs)); +end \ No newline at end of file