forked from andrewssobral/nway
-
Notifications
You must be signed in to change notification settings - Fork 0
/
ncrossreg.m
309 lines (265 loc) · 8.79 KB
/
ncrossreg.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
function [XValResult,Model]=ncrossreg(X,y,MaxFac,Centering,SegmentsID);
%NCROSSREG cross-validation of regression model
%
% See also:
% 'ncrossdecomp'
%
% CROSS-VALIDATION OF BI- & MULTILINEAR REGRESSION MODELS
% Performs cross-validation of
% - NPLS Input multi-way array X
% - PLS Input two-way X
%
% The data are by default centered across the first mode, but no scaling
% is applied (this must be done beforehand)
%
% I/O
% [XValResult,Model]=ncrossreg(X,y,MaxFac,Centering);
%
% INPUT
% X : X array
% y : y array
% MaxFac : Maximal number of factors (from one to MaxFac factors will be investigated)
%
% OPTIONAL INPUT
% Centering : If not zero, centering is performed on every segment
% SegmentsID: Optional binary matrix. Rows as rows in X and i'th column defines i'th segment
% Rows in i'th column set to one are left out at
%
% OUTPUT
% XValResult
% Structured array with the following elements
% ypred : Cross-validated predictions
% ssX_Xval : Cross-validated sum of squares of residuals in X (f'th element for f-component model)
% ssX_Fit : Fitted sum of squares of residuals in X (f'th element for f-component model)
% ssY_Xval : Cross-validated sum of squares of residuals in Y (f'th element for f-component model)
% ssY_Fit : Fitted sum of squares of residuals in Y (f'th element for f-component model)
% Percent : Structured array holding Xexp and Yexp, each with fitted and X-validated % Variance captured.
% PRESS : Predicted REsidual Sum of Squares in Y
% RMSEP : Root Mean Square Error of Prediction (cross-validation)
%
% Model
% Structured array holding the NPLS model
% $ Version 1.0301 $ Date 26. June 1999
% $ Version 1.0302 $ Date 1. January 2000
% $ Version 2.00 $ May 2001 $ Changed to array notation $ RB $ Not compiled $
% $ Version 2.01 $ jan 2003 $ Added option for skipping centering and added percentages in output $ RB $ Not compiled $
% $ Version 2.02 $ Sep 2003 $ Fixed bug in non-center option $ RB $ Not compiled $
% Copyright (C) 1995-2006 Rasmus Bro & Claus Andersson
% Copenhagen University, DK-1958 Frederiksberg, Denmark, [email protected]
%
I = size(X,1);
DimX = size(X);
DimY = size(y);
X = reshape(X,DimX(1),prod(DimX(2:end)));
y = reshape(y,DimY(1),prod(DimY(2:end)));
Ypred = zeros([MaxFac DimY]);
Ex = zeros([MaxFac DimX]);
Ey = zeros([MaxFac DimY]);
if nargin<4
Centering = 1;
elseif isempty(Centering)
Centering = 1;
end
%%%%%%%%%%%%%%%%%
%%MAKE SEGMENTS%%
%%%%%%%%%%%%%%%%%
if exist('SegmentsID')~=1
SegmentsID = MakeSegments(I);
end
%%%%%%%%%%%%%%%%%%%
%%MAKE SUB-MODELS%%
%%%%%%%%%%%%%%%%%%%
for i=1:size(SegmentsID,2)
In = find(~SegmentsID(:,i));
Out = find(SegmentsID(:,i));
% Offsets
if Centering
Mx = nanmean(X(In,:));
My = nanmean(y(In,:));
else
Mx = zeros(1,prod(DimX(2:end)));
My = zeros(1,prod(DimY(2:end)));
end
%Centered data
Xc = X(In,:)-repmat(Mx,length(In),1);
yc = y(In,:)-repmat(My,length(In),1);
% %Centered data
% Xc = X(In,:)-ones(length(In),1)*Mx;
% yc = y(In,:)-ones(length(In),1)*My;
% Calculate model
DimXc = DimX;DimXc(1)=size(Xc,1);
Xc = reshape(Xc,DimXc);
DimYc = DimY;DimYc(1)=size(yc,1);
yc = reshape(yc,DimYc);
[Xfactors,Yfactors,Core,B] = npls(Xc,yc,MaxFac,NaN);
%Predict left-out samples
for f=1:MaxFac
Xc = X(Out,:)-ones(length(Out),1)*Mx;
DimXc = DimX;
DimXc(1)=size(Xc,1);
Xc = reshape(Xc,DimXc);
[ypr,T,ssx,Xres] = npred(Xc,f,Xfactors,Yfactors,Core,B,NaN);
Ex(f,Out,:) = reshape(Xres,DimXc(1),prod(DimXc(2:end)));
Ypred(f,Out,:) = ypr+ones(length(Out),1)*My;
Ypredf = squeeze(Ypred(f,:,:));
if size(y,2) == 1
YpredfOut=Ypredf(Out);
else
YpredfOut=Ypredf(Out,:);
end
%size(Ey(f,Out,:)),size(y(Out,:)),size(YpredfOut)
if size(y,2)==1
Ey(f,Out,:) = squeeze(y(Out,:))'-squeeze(YpredfOut);
else
Ey(f,Out,:) = squeeze(y(Out,:))-squeeze(YpredfOut);
end
end
end
if Centering
Mx = nanmean(X(In,:));
My = nanmean(y(In,:));
else
Mx = zeros(1,prod(DimX(2:end)));
My = zeros(1,prod(DimY(2:end)));
end
%Centered data
Xc = X-repmat(Mx,size(X,1),1);
yc = y-repmat(My,size(y,1),1);
%%Centered data
%Xc = X-ones(I,1)*Mx;
%yc = y-ones(I,1)*My;
[Xfactors,Yfactors,Core,B,ypred,ssx,ssy] = npls(reshape(Xc,DimX),reshape(yc,DimY),MaxFac,NaN);
Model.Xfactors = Xfactors;
Model.Yfactors = Yfactors;
Model.Core = Core;
Model.B = B;
Model.Yfitted = ypred;
Model.MeanX = Mx;
Model.MeanY = My;
sseX_fit = ssx(2:end,1);
sseY_fit = ssy(2:end,1);
for f=1:MaxFac
id=find(~isnan(Ex(f,:)));sseX_xval(f) = sum(Ex(f,id).^2);
id=find(~isnan(Ey(f,:)));sseY_xval(f) = sum(Ey(f,id).^2);
PRESS(f) = sum(Ey(f,id).^2);
end
RMSEP = sqrt(PRESS/I);
Xval = [sseX_fit sseX_xval'];
Yval = [sseY_fit sseY_xval'];
Xval = 100*(1-Xval/sum(Xc(find(~isnan(X))).^2));
Yval = 100*(1-Yval/sum(yc(find(~isnan(y))).^2));
XValResult.ypred = Ypred;
XValResult.ssX_Xval = sseX_xval;
XValResult.ssX_Fit = sseX_fit';
XValResult.ssY_Xval = sseY_xval;
XValResult.ssY_Fit = sseY_fit';
XValResult.Percent.Xexp = Xval;
XValResult.Percent.Yexp = Yval;
XValResult.PRESS = PRESS;
XValResult.RMSEP = RMSEP;
XValResult.DefSegments = sparse(SegmentsID);
disp(' ')
disp(' Percent Variance Captured by N-PLS Model ')
disp(' ')
disp(' -----X-Block----- -----Y-Block-----')
disp(' LV # Fitted Xval Fitted Xval RMSEP')
disp(' ---- ------- ------- ------- ------- ---------')
format = ' %3.0f %6.2f %6.2f %6.2f %6.2f %6.4f';
for f = 1:MaxFac
tab = sprintf(format,[f Xval(f,:) Yval(f,:) RMSEP(f)]);
disp(tab)
end
disp(' ')
function SegmentsID = MakeSegments(I);
XvalMeth=questdlg('Which type of validation do you want to perform (ENTER => full Xval)?','Choose validation','Full X-validation','Segmented','Prespecified','Full X-validation');
switch XvalMeth
case 'Full X-validation'
SegmentsID = speye(I);
case 'Segmented'
prompt={'Enter the number of segments:'};
eval(['def={''',num2str(min(I,max(3,round(I/7)))),'''};'])
dlgTitle='Number of segments';
lineNo=1;
answer=inputdlg(prompt,dlgTitle,lineNo,def);
NumbSegm=eval(answer{1});
% Make sure the number of segments is OK
while NumbSegm<2|NumbSegm>I
prompt={'INCONSISTENT NUMBER CHOSEN (must be > 1 and <= samples)'};
eval(['def={''',num2str(min(I,max(3,round(I/7)))),'''};'])
dlgTitle='Number of segments';
lineNo=1;
answer=inputdlg(prompt,dlgTitle,lineNo,def);
NumbSegm=eval(answer{1})
NumbSegm<2|NumbSegm>I
end
XvalSegm=questdlg('How should segments be chosen?','Choose segmentation','111222333...','123123123...','Random','123123123...');
switch XvalSegm
case '111222333...'
SegmentsID = sparse(I,NumbSegm);
NumbInEachSegm = floor(I/NumbSegm);
Additional = I-NumbInEachSegm*NumbSegm;
currentsample = 1;
for i=1:NumbSegm
if i <=Additional
add = NumbInEachSegm+1;
elseif i<NumbSegm
add = NumbInEachSegm;
else
add = I-currentsample+1;
end
SegmentsID(currentsample:currentsample+add-1,i)=1;
currentsample = currentsample + add;
end
case '123123123...'
SegmentsID = sparse(I,NumbSegm);
NumbInEachSegm = floor(I/NumbSegm);
for i=1:NumbSegm
SegmentsID(i:NumbSegm:end,i)=1;
end
case 'Random'
% Make nonrandom and then randomize order
SegmentsID = sparse(I,NumbSegm);
NumbInEachSegm = floor(I/NumbSegm);
for i=1:NumbSegm
SegmentsID(i:NumbSegm:end,i)=1;
end
rand('state',sum(100*clock)) %Randomize randomizer
[a,b] = sort(rand(I,1))
SegmentsID = SegmentsID(b,:);
end
case 'Prespecified'
prompt={'Enter the name of the file defining the subsets'};
def={'SegmentsID'};
dlgTitle='Import definition';
lineNo=1;
answer=inputdlg(prompt,dlgTitle,lineNo,def);
SegmentsID=eval(answer{1});
end % switch
function y = nanmean(x)
%NANMEAN Average or mean ignoring NaNs.
% NANMEAN(X) returns the average treating NaNs as missing values.
% For vectors, NANMEAN(X) is the mean value of the non-NaN
% elements in X. For matrices, NANMEAN(X) is a row vector
% containing the mean value of each column, ignoring NaNs.
%
% See also NANMEDIAN, NANSTD, NANMIN, NANMAX, NANSUM.
% Copyright (c) 1993-98 by The MathWorks, Inc.
% $Revision: 2.8 $ $Date: 1997/11/29 01:45:53 $
if isempty(x) % Check for empty input.
y = NaN;
return
end
% Replace NaNs with zeros.
nans = isnan(x);
i = find(nans);
x(i) = zeros(size(i));
if min(size(x))==1,
count = length(x)-sum(nans);
else
count = size(x,1)-sum(nans);
end
% Protect against a column of all NaNs
i = find(count==0);
count(i) = ones(size(i));
y = sum(x)./count;
y(i) = i + NaN;