0001 function [w,y]=nt_outliers(x,w,thresh,niter)
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014 nt_greetings;
0015
0016 PCA_THRESH=10^-15;
0017 nClosest=min(10,size(x,2)-1);
0018
0019
0020 if nargin<1; error('!'); end
0021 if nargin<2||isempty(w); w=ones(size(x)); end
0022 if nargin<3||isempty(thresh); thresh=2; end
0023 if nargin<4||isempty(niter); niter=4; end
0024 if ndims(x)>2; error('!'); end
0025 if ~all(size(x)==size(w)); error('!'); end
0026 [nsamples,nchan]=size(x);
0027
0028
0029
0030 On each iteration we reconstruct each channels based on other channels and
0031 then flag the parts that mismatch as corrupt.
0032
0033 x0=x;
0034
0035 figure(200); clf
0036 for iIter=1:niter
0037
0038
0039 We have multichannel data x and a multichannel weighting function w (0 or
0040 1). There are many configurations of valid/invalid channels to consider.
0041 List them.
0042
0043
0044 [ww,nOccurrences,iBack]=patternDict(w);
0045 nPatterns=size(ww,1);
0046
0047
0048 Now we have a list of all the different weight patterns: ww. The
0049 vector iBack indicates which data samples fit each pattern: w = ww(iBack,:).
0050
0051
0052
0053 Find which channels are 'neighbors' in terms of covariance.
0054
0055
0056
0057 [x,save_mean]=nt_demean(x0,w);
0058 [x,save_amp]=nt_normcol(x,w);
0059 xx=x.*w;
0060 c=(xx'*xx) ./ (w'*w); clear xx;
0061 c=abs(c);
0062 sims=c+10*eye(size(c));
0063
0064 y=x;
0065
0066
0067 We now have a matrix indicating proximity between channels.
0068
0069
0070
0071 For each channel, we calculate the projection matrix on the the subspace spanned
0072 by other *valid* channels. There are as many projection matrices as patterns
0073 of valid/invalid channels. Each projection matrix is estimated on data samples for
0074 which iChan is valid, and can be used to reconstruct data samples for which it is
0075 invalid.
0076
0077
0078 for iChan=1:nchan
0079
0080
0081 We want to avoid having to consider all patterns of valid/unvalid
0082 other channels. For that we'll group patterns.
0083 First we order the other channels by decreasing similarity, putting
0084 invalid samples last. This needs to be done for each pattern.
0085
0086
0087 sim=sims(iChan,:);
0088 sim=repmat(sim,nPatterns,1);
0089 sim((~ww))=0;
0090 [~,closest]=sort(abs(sim),2,'descend');
0091 for k=1:size(closest,1);
0092 closest(k,find(sim(k,closest(k,:))==0))=0;
0093 end
0094 for k=1:size(closest,1);
0095 if closest(k,1)==iChan;
0096 closest(k,1:end-1)=closest(k,2:end);
0097 else
0098
0099 end
0100 end
0101 closest=closest(:,1:end-1);
0102
0103
0104 We now have, for each pattern, a list of channels closest to iChan.
0105 There are a lot of different patterns, so we merge those for which the nClosest
0106 channels are the same.
0107
0108
0109
0110 [C,IA,IC]=unique(closest(:,1:nClosest),'rows');
0111 iBack2=IC(iBack);
0112
0113
0114 We now have a smaller array C of reduced patterns. The
0115 vector iBack2 indicates which data samples correspond to each pattern.
0116
0117
0118
0119 For some patterns, only a few samples are invalid. Skip these
0120 and iterpolate later using serial interpolation.
0121
0122
0123
0124 toFix=[];
0125 NSKIP=4;
0126 www=ones(size(x,1),1);
0127 for iPattern=1:size(C,1)
0128 mySamples=find(iBack2==iPattern);
0129 if numel(mySamples)<=NSKIP
0130 www(mySamples)=0;
0131 else
0132 toFix=[toFix,iPattern];
0133 end
0134 end
0135 C=C(toFix,:);
0136
0137
0138
0139 for iPattern=1:size(C,1)
0140
0141
0142 Estimate matrix to project iChan on the other channels listed in this
0143 pattern.
0144
0145
0146
0147 oChan=C(iPattern,:);
0148
0149 oChan(find(oChan==0))=[];
0150
0151 if any(iChan==oChan); error('!'); end
0152
0153
0154 mySamples=find(iBack2==toFix(iPattern));
0155
0156
0157 iBothValid=all(w(:,[iChan,oChan]),2);
0158 xxx=x(iBothValid, [iChan,oChan]);
0159
0160
0161
0162 if isempty(xxx);
0163 disp([iChan, iPattern]); disp('empty');
0164 continue;
0165 end
0166
0167
0168 mn=mean(xxx,1);
0169 xxx=nt_demean(xxx);
0170 ccc=xxx'*xxx;
0171
0172
0173 [topcs,eigenvalues]=nt_pcarot(ccc(2:end,2:end));
0174 idx=find(eigenvalues/max(eigenvalues) > PCA_THRESH);
0175 topcs=topcs(:,idx);
0176
0177
0178 b=ccc(1,2:end)*topcs / (topcs'*ccc(2:end,2:end)*topcs);
0179
0180
0181 We now have the projection matrix to project channel iChan on channels oChan,
0182 applicable to samples corresponding to this pattern. We can use it
0183 to fix samples for which iChan is invalid.
0184
0185
0186 y(mySamples,iChan) = ...
0187 (x(mySamples,oChan) - repmat(mn(2:end),numel(mySamples),1))...
0188 *(topcs*b') ...
0189 + mn(1);
0190
0191 end
0192
0193
0194 Now we fix the isolated samples that we skipped, using serial interpolation.
0195
0196
0197 MAXGAPSIZE=100;
0198 y(:,iChan)=fillgap(y(:,iChan),www,MAXGAPSIZE);
0199
0200 end
0201
0202
0203 y=bsxfun(@times,y,save_amp);
0204 y=bsxfun(@plus,y,save_mean);
0205
0206 v=min(std(x0(find(w))),std(y(find(w))));
0207 d = abs(y-x0);
0208 w=double( (d/v < thresh) );
0209 score=mean( (d.*w)) ./ mean(w);
0210 disp([num2str(iIter), ', score: ',num2str(sum(score))]);
0211
0212 ch=33; FOCUS=1:size(x,1);
0213 figure(200);
0214
0215
0216 plot(score); hold on; drawnow
0217
0218 end
0219
0220
0221 if ~nargout
0222
0223 disp(nt_wpwr(y)/nt_wpwr(x));
0224 figure(11); clf;
0225 subplot 311; plot(x0); title('raw'); xlim([1 size(x0,1)]);
0226 subplot 312; plot(y); title('projected on other channels'); xlim([1 size(x0,1)]);
0227 subplot 313; nt_imagescc(w'); title('weight');
0228 clear w
0229 end
0230
0231
0232
0233 function [y,w]=fillgap(x,w,maxGapSize)
0234
0235
0236
0237
0238
0239
0240
0241 if nargin<2; error('!'); end
0242 if nargin<3||isempty(maxGapSize); maxGapSize=1; end
0243 if size(x,2)>1; error('!'); end
0244 if size(x) ~= size(w); error('!'); end
0245 y=x;
0246 if all(w); return; end
0247
0248 iToFix=1+find(~w(2:end-1)&w(1:end-2)&w(3:end));
0249 y(iToFix)=(y(iToFix-1)+y(iToFix+1))/2;
0250 w(iToFix)=1;
0251
0252 iStart=find(w(1:end-2) & ~w(2:end-1));
0253 iStop=find(~w(1:end-1) & w(2:end));
0254 if isempty(iStart)||isempty(iStop); return; end
0255 if iStop(1)<iStart(1);
0256 iStop=iStop(2:end);
0257 end
0258 iStart=iStart(1:numel(iStop));
0259 for gapSize=2:maxGapSize
0260 idx=find(iStop-iStart==gapSize);
0261 for k=1:gapSize
0262
0263 y(iStart(idx)+k) = ( y(iStart(idx)) * (gapSize-k+1) + y(iStart(idx)+gapSize+1) * k ) / (gapSize+1);
0264 w(iStart(idx)+k) = 1;
0265 end
0266 end
0267
0268
0269
0270
0271 function [ww,nOccurrences,iBack]=patternDict(w)
0272
0273
0274
0275 [ww,~,IC,nOccurrences]=nt_unique(w,'rows');
0276 [nOccurrences,iSort]=sort(nOccurrences, 'descend');
0277 [~,iReverse]=sort(iSort);
0278 ww=ww(iSort,:);
0279 iBack=iReverse(IC);
0280
0281
0282 if 0
0283 nsources=3;
0284 x0=sin(2*pi*(1:10000)'*(1:nsources)/10000);
0285 x=x0*nt_normcol(randn(nsources,6));
0286 w=ones(size(x));
0287 x(1:1000,1)=10; w(1:1000,1)=0;
0288 x(2001:3000,1)=10; w(2001:3000,1)=0;
0289 x(1:2000,2)=10; w(1:2000,2)=0;
0290
0291
0292 [ww,yy]=nt_outliers(x,[],1);
0293 figure(1); clf
0294 subplot 311; plot(x); title('raw'); subplot 312; plot(yy); title('interpolated'); subplot 313; nt_imagescc(w'); title('weights');
0295 end
0296 if 0
0297 N=20;
0298 nchans=50;
0299 x=zeros(1100,N);
0300 for k=1:N
0301 x(:,k)=sin(2*pi*k*(1:1100)'/1100);
0302 end
0303 x=x*randn(N,nchans);
0304 x=nt_normcol(x) + 0*randn(size(x));
0305 w=ones(size(x));
0306 x0=x;
0307 a=30
0308 for k=1:nchans
0309 x(k*20+(1:a),k)=20*randn(a,1);
0310 w(k*20+(1:a),k)=0;
0311 end
0312 [ww,yy]=nt_outliers(x,[],1);
0313 figure(1); clf
0314 subplot 311; plot(x); title('raw');
0315 subplot 312; plot(yy); title('interpolated'); subplot 313; nt_imagescc(ww'); title('weights');
0316 end
0317 if 0
0318 N=10;
0319 nchans=20;
0320 nsamples=1100;
0321 x=zeros(nsamples,N);
0322 for k=1:N
0323 x(:,k)=sin(2*pi*k*(1:nsamples)'/nsamples);
0324 end
0325 x=x*randn(N,nchans);
0326
0327 w=ones(size(x));
0328 x0=x;
0329 for k=1:nchans
0330 x(500+k*20+(1:40),k)=10;
0331 w(500+k*20+(1:40),k)=0;
0332 end
0333 [ww,y]=nt_outliers(x,[],1);
0334 figure(1); clf;
0335 subplot 412; plot(x); title ('raw');
0336 subplot 413; plot (y); title ('interpolated');
0337 subplot 414; nt_imagescc(ww'); title ('weights');
0338 end
0339 if 0
0340 [p,x]=nt_read_data('/data/meg/theoldmanandthesea/eeg/mc/MC_aespa_speech_45.mat'); x=x'; x=x(:,1:128); x=x(0000+(1:6000),:);
0341
0342
0343 x=nt_demean(x);
0344 [x,w]=nt_detrend(x,3,[],[],3);
0345 profile on; y=nt_inpaint(x,w); profile report;
0346 figure(1); clf
0347 subplot 311; plot(x); title('raw'); subplot 312; plot(y); title('clean'); subplot 313; plot(x-y); title('raw-clean');
0348 figure(2); clf
0349 ch=35;subplot 311; plot([x(:,ch),y(:,ch)]); subplot 312; plot(x(:,ch)-y(:,ch)); subplot 313; plot(w(:,ch), '.-');
0350 [ww,yy]=nt_outliers(x,[],1); drawnow;
0351 figure(4); clf;
0352 subplot 311; plot(x);
0353 subplot 312; plot(yy);
0354 subplot 313; nt_imagescc(ww');
0355 end
0356
0357