0001 function [w,y]=nt_outliers(x,w,thresh,niter)
0002
0003
0004
0005
0006
0007
0008
0009
0010
0011
0012
0013
0014 nt_greetings;
0015
0016 PCA_THRESH=10^-15;
0017 nClosest=min(10,size(x,2)-1);
0018
0019
0020 if nargin<1; error('!'); end
0021 if nargin<2||isempty(w); w=ones(size(x)); end
0022 if nargin<3||isempty(thresh); thresh=2; end
0023 if nargin<4||isempty(niter); niter=4; end
0024 if ndims(x)>2; error('!'); end
0025 if ~all(size(x)==size(w)); error('!'); end
0026 [nsamples,nchan]=size(x);
0027
0028
0029
0030 On each iteration we reconstruct each channels based on other channels and
0031 then flag the parts that mismatch as corrupt.
0032
0033
0034 x0=x;
0035
0036 figure(200); clf
0037 for iIter=1:niter
0038
0039
0040 We have multichannel data x and a multichannel weighting function w (0 or
0041 1). There are many configurations of valid/invalid channels to consider.
0042 List them.
0043
0044
0045 [ww,nOccurrences,iBack]=patternDict(w);
0046 nPatterns=size(ww,1);
0047
0048
0049 Now we have a list of all the different weight patterns: ww. The
0050 vector iBack indicates which data samples fit each pattern: w = ww(iBack,:).
0051
0052
0053
0054 Find which channels are 'neighbors' in terms of covariance.
0055
0056
0057
0058 [x,save_mean]=nt_demean(x0,w);
0059 [x,save_amp]=nt_normcol(x,w);
0060 xx=x.*w;
0061 c=(xx'*xx) ./ (w'*w); clear xx;
0062 c=abs(c);
0063 sims=c+10*eye(size(c));
0064
0065 y=x;
0066
0067
0068 We now have a matrix indicating proximity between channels.
0069
0070
0071
0072 For each channel, we calculate the projection matrix on the the subspace spanned
0073 by other *valid* channels. There are as many projection matrices as patterns
0074 of valid/invalid channels. Each projection matrix is estimated on data samples for
0075 which iChan is valid, and can be used to reconstruct data samples for which it is
0076 invalid.
0077
0078
0079 for iChan=1:nchan
0080
0081
0082 We want to avoid having to consider all patterns of valid/unvalid
0083 other channels. For that we'll group patterns.
0084 First we order the other channels by decreasing similarity, putting
0085 invalid samples last. This needs to be done for each pattern.
0086
0087
0088 sim=sims(iChan,:);
0089 sim=repmat(sim,nPatterns,1);
0090 sim((~ww))=0;
0091 [~,closest]=sort(abs(sim),2,'descend');
0092 for k=1:size(closest,1);
0093 closest(k,find(sim(k,closest(k,:))==0))=0;
0094 end
0095 for k=1:size(closest,1);
0096 if closest(k,1)==iChan;
0097 closest(k,1:end-1)=closest(k,2:end);
0098 else
0099
0100 end
0101 end
0102 closest=closest(:,1:end-1);
0103
0104
0105 We now have, for each pattern, a list of channels closest to iChan.
0106 There are a lot of different patterns, so we merge those for which the nClosest
0107 channels are the same.
0108
0109
0110
0111 [C,IA,IC]=unique(closest(:,1:nClosest),'rows');
0112 iBack2=IC(iBack);
0113
0114
0115 We now have a smaller array C of reduced patterns. The
0116 vector iBack2 indicates which data samples correspond to each pattern.
0117
0118
0119
0120 For some patterns, only a few samples are invalid. Skip these
0121 and iterpolate later using serial interpolation.
0122
0123
0124
0125 toFix=[];
0126 NSKIP=4;
0127 www=ones(size(x,1),1);
0128 for iPattern=1:size(C,1)
0129 mySamples=find(iBack2==iPattern);
0130 if numel(mySamples)<=NSKIP
0131 www(mySamples)=0;
0132 else
0133 toFix=[toFix,iPattern];
0134 end
0135 end
0136 C=C(toFix,:);
0137
0138
0139
0140 for iPattern=1:size(C,1)
0141
0142
0143 Estimate matrix to project iChan on the other channels listed in this
0144 pattern.
0145
0146
0147
0148 oChan=C(iPattern,:);
0149
0150 oChan(find(oChan==0))=[];
0151
0152 if any(iChan==oChan); error('!'); end
0153
0154
0155 mySamples=find(iBack2==toFix(iPattern));
0156
0157
0158 iBothValid=all(w(:,[iChan,oChan]),2);
0159 xxx=x(iBothValid, [iChan,oChan]);
0160
0161
0162
0163 if isempty(xxx);
0164 disp([iChan, iPattern]); disp('empty');
0165 continue;
0166 end
0167
0168
0169 mn=mean(xxx,1);
0170 xxx=nt_demean(xxx);
0171 ccc=xxx'*xxx;
0172
0173
0174 [topcs,eigenvalues]=nt_pcarot(ccc(2:end,2:end));
0175 idx=find(eigenvalues/max(eigenvalues) > PCA_THRESH);
0176 topcs=topcs(:,idx);
0177
0178
0179 b=ccc(1,2:end)*topcs / (topcs'*ccc(2:end,2:end)*topcs);
0180
0181
0182 We now have the projection matrix to project channel iChan on channels oChan,
0183 applicable to samples corresponding to this pattern. We can use it
0184 to fix samples for which iChan is invalid.
0185
0186
0187 y(mySamples,iChan) = ...
0188 (x(mySamples,oChan) - repmat(mn(2:end),numel(mySamples),1))...
0189 *(topcs*b') ...
0190 + mn(1);
0191
0192 end
0193
0194
0195 Now we fix the isolated samples that we skipped, using serial interpolation.
0196
0197
0198 MAXGAPSIZE=100;
0199 y(:,iChan)=fillgap(y(:,iChan),www,MAXGAPSIZE);
0200
0201 end
0202
0203
0204 y=bsxfun(@times,y,save_amp);
0205 y=bsxfun(@plus,y,save_mean);
0206
0207 v=min(std(x0(find(w))),std(y(find(w))));
0208 d = abs(y-x0);
0209 w=double( (d/v < thresh) );
0210 score=mean( (d.*w)) ./ mean(w);
0211 disp([num2str(iIter), ', score: ',num2str(sum(score))]);
0212
0213 ch=33; FOCUS=1:size(x,1);
0214 figure(200);
0215
0216
0217 plot(score); hold on; drawnow
0218
0219 end
0220
0221
0222 if ~nargout
0223
0224 disp(nt_wpwr(y)/nt_wpwr(x));
0225 figure(11); clf;
0226 subplot 311; plot(x0); title('raw'); xlim([1 size(x0,1)]);
0227 subplot 312; plot(y); title('projected on other channels'); xlim([1 size(x0,1)]);
0228 subplot 313; nt_imagescc(w'); title('weight');
0229 clear w
0230 end
0231
0232
0233
0234 function [y,w]=fillgap(x,w,maxGapSize)
0235
0236
0237
0238
0239
0240
0241
0242 if nargin<2; error('!'); end
0243 if nargin<3||isempty(maxGapSize); maxGapSize=1; end
0244 if size(x,2)>1; error('!'); end
0245 if size(x) ~= size(w); error('!'); end
0246 y=x;
0247 if all(w); return; end
0248
0249 iToFix=1+find(~w(2:end-1)&w(1:end-2)&w(3:end));
0250 y(iToFix)=(y(iToFix-1)+y(iToFix+1))/2;
0251 w(iToFix)=1;
0252
0253 iStart=find(w(1:end-2) & ~w(2:end-1));
0254 iStop=find(~w(1:end-1) & w(2:end));
0255 if isempty(iStart)||isempty(iStop); return; end
0256 if iStop(1)<iStart(1);
0257 iStop=iStop(2:end);
0258 end
0259 iStart=iStart(1:numel(iStop));
0260 for gapSize=2:maxGapSize
0261 idx=find(iStop-iStart==gapSize);
0262 for k=1:gapSize
0263
0264 y(iStart(idx)+k) = ( y(iStart(idx)) * (gapSize-k+1) + y(iStart(idx)+gapSize+1) * k ) / (gapSize+1);
0265 w(iStart(idx)+k) = 1;
0266 end
0267 end
0268
0269
0270
0271
0272 function [ww,nOccurrences,iBack]=patternDict(w)
0273
0274
0275
0276 [ww,~,IC,nOccurrences]=nt_unique(w,'rows');
0277 [nOccurrences,iSort]=sort(nOccurrences, 'descend');
0278 [~,iReverse]=sort(iSort);
0279 ww=ww(iSort,:);
0280 iBack=iReverse(IC);
0281
0282
0283 if 0
0284 nsources=3;
0285 x0=sin(2*pi*(1:10000)'*(1:nsources)/10000);
0286 x=x0*nt_normcol(randn(nsources,6));
0287 w=ones(size(x));
0288 x(1:1000,1)=10; w(1:1000,1)=0;
0289 x(2001:3000,1)=10; w(2001:3000,1)=0;
0290 x(1:2000,2)=10; w(1:2000,2)=0;
0291
0292
0293 [ww,yy]=nt_outliers(x,[],1);
0294 figure(1); clf
0295 subplot 311; plot(x); title('raw'); subplot 312; plot(yy); title('interpolated'); subplot 313; nt_imagescc(w'); title('weights');
0296 end
0297 if 0
0298 N=20;
0299 nchans=50;
0300 x=zeros(1100,N);
0301 for k=1:N
0302 x(:,k)=sin(2*pi*k*(1:1100)'/1100);
0303 end
0304 x=x*randn(N,nchans);
0305 x=nt_normcol(x) + 0*randn(size(x));
0306 w=ones(size(x));
0307 x0=x;
0308 a=30
0309 for k=1:nchans
0310 x(k*20+(1:a),k)=20*randn(a,1);
0311 w(k*20+(1:a),k)=0;
0312 end
0313 [ww,yy]=nt_outliers(x,[],1);
0314 figure(1); clf
0315 subplot 311; plot(x); title('raw');
0316 subplot 312; plot(yy); title('interpolated'); subplot 313; nt_imagescc(ww'); title('weights');
0317 end
0318 if 0
0319 N=10;
0320 nchans=20;
0321 nsamples=1100;
0322 x=zeros(nsamples,N);
0323 for k=1:N
0324 x(:,k)=sin(2*pi*k*(1:nsamples)'/nsamples);
0325 end
0326 x=x*randn(N,nchans);
0327
0328 w=ones(size(x));
0329 x0=x;
0330 for k=1:nchans
0331 x(500+k*20+(1:40),k)=10;
0332 w(500+k*20+(1:40),k)=0;
0333 end
0334 [ww,y]=nt_outliers(x,[],1);
0335 figure(1); clf;
0336 subplot 412; plot(x); title ('raw');
0337 subplot 413; plot (y); title ('interpolated');
0338 subplot 414; nt_imagescc(ww'); title ('weights');
0339 end
0340 if 0
0341 [p,x]=nt_read_data('/data/meg/theoldmanandthesea/eeg/mc/MC_aespa_speech_45.mat'); x=x'; x=x(:,1:128); x=x(0000+(1:6000),:);
0342
0343
0344 x=nt_demean(x);
0345 [x,w]=nt_detrend(x,3,[],[],3);
0346 profile on; y=nt_inpaint(x,w); profile report;
0347 figure(1); clf
0348 subplot 311; plot(x); title('raw'); subplot 312; plot(y); title('clean'); subplot 313; plot(x-y); title('raw-clean');
0349 figure(2); clf
0350 ch=35;subplot 311; plot([x(:,ch),y(:,ch)]); subplot 312; plot(x(:,ch)-y(:,ch)); subplot 313; plot(w(:,ch), '.-');
0351 [ww,yy]=nt_outliers(x,[],1); drawnow;
0352 figure(4); clf;
0353 subplot 311; plot(x);
0354 subplot 312; plot(yy);
0355 subplot 313; nt_imagescc(ww');
0356 end
0357
0358