Home > NoiseTools > nt_outliers.m

nt_outliers

PURPOSE ^

[w,y]=nt_outliers(x,w,thresh,niter) - detect outliers based on weighted correlation structure

SYNOPSIS ^

function [w,y]=nt_outliers(x,w,thresh,niter)

DESCRIPTION ^

[w,y]=nt_outliers(x,w,thresh,niter) - detect outliers based on weighted correlation structure

  w: weights (indicates outliers)
  y: interpolated data

  x: data matrix
  w: initial weights
  thresh: threshold for declaring an outlier [default: 2]
  niter: number of iterations [default: 3]


 Noisetools.

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001  function [w,y]=nt_outliers(x,w,thresh,niter)
0002 %[w,y]=nt_outliers(x,w,thresh,niter) - detect outliers based on weighted correlation structure
0003 %
0004 %  w: weights (indicates outliers)
0005 %  y: interpolated data
0006 %
0007 %  x: data matrix
0008 %  w: initial weights
0009 %  thresh: threshold for declaring an outlier [default: 2]
0010 %  niter: number of iterations [default: 3]
0011 %
0012 %
0013 % Noisetools.
0014 nt_greetings;
0015 
0016 PCA_THRESH=10^-15;
0017 nClosest=min(10,size(x,2)-1);
0018 %nClosest=40;
0019 
0020 if nargin<1; error('!'); end
0021 if nargin<2||isempty(w); w=ones(size(x)); end
0022 if nargin<3||isempty(thresh); thresh=2; end
0023 if nargin<4||isempty(niter); niter=4; end
0024 if ndims(x)>2; error('!'); end
0025 if ~all(size(x)==size(w)); error('!'); end
0026 [nsamples,nchan]=size(x);
0027 
0028 
0029 %{
0030 On each iteration we reconstruct each channels based on other channels and
0031 then flag the parts that mismatch as corrupt.
0032 
0033 %}
0034 x0=x;
0035 
0036 figure(200); clf
0037 for iIter=1:niter
0038     
0039     %{
0040     We have multichannel data x and a multichannel weighting function w (0 or
0041     1). There are many configurations of valid/invalid channels to consider.
0042     List them.
0043     %}
0044     
0045     [ww,nOccurrences,iBack]=patternDict(w); % all patterns of good/bad channels
0046     nPatterns=size(ww,1);
0047     %disp('npatterns'); disp(nPatterns)
0048     %{
0049     Now we have a list of all the different weight patterns: ww. The
0050     vector iBack indicates which data samples fit each pattern: w = ww(iBack,:).
0051     %}
0052 
0053     %{
0054     Find which channels are 'neighbors' in terms of covariance.
0055     %}
0056 
0057     % weighted covariance matrix to determine which channels are close
0058     [x,save_mean]=nt_demean(x0,w); 
0059     [x,save_amp]=nt_normcol(x,w);
0060     xx=x.*w;
0061     c=(xx'*xx) ./ (w'*w); clear xx;
0062     c=abs(c); 
0063     sims=c+10*eye(size(c)); % make sure self always scores highest so we can skip it
0064 
0065     y=x;  
0066 
0067     %{
0068     We now have a matrix indicating proximity between channels. 
0069     %}
0070 
0071     %{
0072     For each channel, we calculate the projection matrix on the the subspace spanned 
0073     by other *valid* channels.  There are as many projection matrices as patterns 
0074     of valid/invalid channels.  Each projection matrix is estimated on data samples for
0075     which iChan is valid, and can be used to reconstruct data samples for which it is 
0076     invalid.
0077     %}
0078 
0079     for iChan=1:nchan
0080 
0081         %{
0082         We want to avoid having to consider all patterns of valid/unvalid 
0083         other channels. For that we'll group patterns. 
0084         First we order the other channels by decreasing similarity, putting
0085         invalid samples last. This needs to be done for each pattern.
0086         %}
0087 
0088         sim=sims(iChan,:);              % list of similarities with all other channels
0089         sim=repmat(sim,nPatterns,1);    % replicate so each pattern has own list
0090         sim((~ww))=0;                   % for each list, give bad channels a low score
0091         [~,closest]=sort(abs(sim),2,'descend');     % sort each list by decreasing similarity
0092         for k=1:size(closest,1);
0093             closest(k,find(sim(k,closest(k,:))==0))=0;     % mark bad channels as 0
0094         end
0095         for k=1:size(closest,1);
0096             if closest(k,1)==iChan; 
0097                 closest(k,1:end-1)=closest(k,2:end);
0098              else
0099                 % iChan was bad so not first
0100             end
0101         end
0102         closest=closest(:,1:end-1);     % last not valid if first skipped
0103 
0104         %{
0105         We now have, for each pattern, a list of channels closest to iChan. 
0106         There are a lot of different patterns, so we merge those for which the nClosest 
0107         channels are the same.
0108         %}
0109 
0110         % group patterns for which the nClosest most similar channels are the same
0111         [C,IA,IC]=unique(closest(:,1:nClosest),'rows');
0112         iBack2=IC(iBack);       % maps each pattern to the data that fit it
0113 
0114         %{
0115         We now have a smaller array C of reduced patterns. The
0116         vector iBack2 indicates which data samples correspond to each pattern.
0117         %}
0118 
0119         %{
0120         For some patterns, only a few samples are invalid. Skip these
0121         and iterpolate later using serial interpolation.
0122         %}
0123         
0124 
0125         toFix=[];
0126         NSKIP=4; % skip if fewer samples
0127         www=ones(size(x,1),1);
0128         for iPattern=1:size(C,1)
0129             mySamples=find(iBack2==iPattern); 
0130             if numel(mySamples)<=NSKIP
0131                 www(mySamples)=0;
0132             else
0133                 toFix=[toFix,iPattern];
0134             end
0135         end
0136         C=C(toFix,:);
0137 
0138         %disp(size(C,1))
0139 
0140         for iPattern=1:size(C,1)
0141 
0142             %{
0143             Estimate matrix to project iChan on the other channels listed in this
0144             pattern. 
0145             %}
0146 
0147             %disp([iChan iPattern])
0148             oChan=C(iPattern,:);
0149             %disp(oChan)
0150             oChan(find(oChan==0))=[]; % exclude bad channels
0151             
0152             if any(iChan==oChan); error('!'); end
0153 
0154             % samples corresponding to this pattern
0155             mySamples=find(iBack2==toFix(iPattern)); 
0156 
0157             % select data for which iChan *and* oChan are valid
0158             iBothValid=all(w(:,[iChan,oChan]),2);        
0159             xxx=x(iBothValid, [iChan,oChan]);  
0160             %figure(3); clf; plot (iBothValid); title([iChan oChan]);
0161 
0162             %%% --> we should be able to avoid this situation
0163             if isempty(xxx); 
0164                 disp([iChan, iPattern]); disp('empty'); 
0165                 continue; % we won't estimate or fix anything
0166             end
0167 
0168             % calculate covariance matrix
0169             mn=mean(xxx,1);
0170             xxx=nt_demean(xxx); % remove mean first
0171             ccc=xxx'*xxx;
0172 
0173             % PCA other channels to remove weak dimensions
0174             [topcs,eigenvalues]=nt_pcarot(ccc(2:end,2:end));
0175             idx=find(eigenvalues/max(eigenvalues) > PCA_THRESH); % discard weak dims
0176             topcs=topcs(:,idx);
0177 
0178             % projection matrix
0179             b=ccc(1,2:end)*topcs / (topcs'*ccc(2:end,2:end)*topcs);  
0180             
0181             %{
0182             We now have the projection matrix to project channel iChan on channels oChan,
0183             applicable to samples corresponding to this pattern.  We can use it
0184             to fix samples for which iChan is invalid.
0185             %}
0186             
0187             y(mySamples,iChan) = ...
0188                (x(mySamples,oChan) - repmat(mn(2:end),numel(mySamples),1))... % first remove mean of other chans...
0189                 *(topcs*b') ...
0190                 + mn(1); % ... then restore mean of this channel
0191             
0192         end
0193 
0194         %{
0195         Now we fix the isolated samples that we skipped, using serial interpolation.
0196         %}
0197 
0198         MAXGAPSIZE=100;
0199         y(:,iChan)=fillgap(y(:,iChan),www,MAXGAPSIZE);
0200 
0201     end
0202 
0203 
0204     y=bsxfun(@times,y,save_amp); % restore the initial amplitude
0205     y=bsxfun(@plus,y,save_mean); % restore the initial mean
0206     
0207     v=min(std(x0(find(w))),std(y(find(w))));
0208     d = abs(y-x0); 
0209     w=double( (d/v < thresh) );
0210     score=mean( (d.*w)) ./ mean(w);
0211     disp([num2str(iIter), ', score: ',num2str(sum(score))]);
0212     
0213     ch=33; FOCUS=1:size(x,1);
0214     figure(200);
0215     %plot(x0(FOCUS,ch), 'k'); hold on
0216     %plot(w(FOCUS,ch) .* (y(FOCUS,ch)-x0(FOCUS,ch)));  hold on; drawnow
0217     plot(score); hold on; drawnow
0218     
0219 end % iterations
0220 
0221 
0222 if ~nargout
0223     % plot, don't return values
0224     disp(nt_wpwr(y)/nt_wpwr(x));
0225     figure(11); clf;
0226     subplot 311; plot(x0); title('raw'); xlim([1 size(x0,1)]);
0227     subplot 312; plot(y); title('projected on other channels'); xlim([1 size(x0,1)]);
0228     subplot 313; nt_imagescc(w'); title('weight');
0229     clear w
0230 end
0231 
0232 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0233 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0234 function [y,w]=fillgap(x,w,maxGapSize)
0235 %y=fillgap(x,w,maxGapSize) - fill gaps using simple interpolation
0236 %
0237 %  y: interpolated data
0238 %
0239 %  x: data to interpolate
0240 %  w: weighting function
0241 %  maxGapSize: largest expected gap size
0242 if nargin<2; error('!'); end
0243 if nargin<3||isempty(maxGapSize); maxGapSize=1; end
0244 if size(x,2)>1; error('!'); end
0245 if size(x) ~= size(w); error('!'); end
0246 y=x;
0247 if all(w); return; end
0248 % simple case size one
0249 iToFix=1+find(~w(2:end-1)&w(1:end-2)&w(3:end)); 
0250 y(iToFix)=(y(iToFix-1)+y(iToFix+1))/2;
0251 w(iToFix)=1; 
0252 % general case size > 1
0253 iStart=find(w(1:end-2) & ~w(2:end-1));  % position preceding gap
0254 iStop=find(~w(1:end-1) & w(2:end));     % last position in gap
0255 if isempty(iStart)||isempty(iStop); return; end
0256 if iStop(1)<iStart(1);
0257     iStop=iStop(2:end);                 % ignore gap at beginning
0258 end
0259 iStart=iStart(1:numel(iStop));          % ignore gap at end
0260 for gapSize=2:maxGapSize
0261     idx=find(iStop-iStart==gapSize);
0262     for k=1:gapSize
0263         % interpolate between samples on either side of gap
0264         y(iStart(idx)+k) = ( y(iStart(idx)) * (gapSize-k+1) + y(iStart(idx)+gapSize+1) * k ) / (gapSize+1);
0265         w(iStart(idx)+k) = 1;
0266     end
0267 end
0268 
0269 
0270 
0271 % create a dictionary of weight patterns
0272 function [ww,nOccurrences,iBack]=patternDict(w)
0273 % ww: dictionary of patterns
0274 % nOccurrences: number of times each pattern occurred
0275 % iBack: index to reconstruct input from dictionary
0276 [ww,~,IC,nOccurrences]=nt_unique(w,'rows');
0277 [nOccurrences,iSort]=sort(nOccurrences, 'descend'); % sort by decreasing number
0278 [~,iReverse]=sort(iSort); %
0279 ww=ww(iSort,:); % same order for patterns, w = ww(iReverse1(IC),:)
0280 iBack=iReverse(IC); % w = ww(iBack,:)
0281 
0282 %%% TEST %%%
0283 if 0
0284     nsources=3;
0285     x0=sin(2*pi*(1:10000)'*(1:nsources)/10000);
0286     x=x0*nt_normcol(randn(nsources,6));
0287     w=ones(size(x));
0288     x(1:1000,1)=10; w(1:1000,1)=0;
0289     x(2001:3000,1)=10; w(2001:3000,1)=0;
0290     x(1:2000,2)=10; w(1:2000,2)=0;
0291     %x=x+0.1*randn(size(x));
0292     %y=nt_inpaint(x,w);
0293     [ww,yy]=nt_outliers(x,[],1);
0294     figure(1); clf
0295     subplot 311; plot(x); title('raw');  subplot 312; plot(yy); title('interpolated'); subplot 313; nt_imagescc(w'); title('weights');
0296 end
0297 if 0
0298     N=20;
0299     nchans=50;
0300     x=zeros(1100,N);
0301     for k=1:N
0302         x(:,k)=sin(2*pi*k*(1:1100)'/1100);
0303     end
0304     x=x*randn(N,nchans);
0305     x=nt_normcol(x) + 0*randn(size(x));
0306     w=ones(size(x));
0307     x0=x;
0308     a=30
0309     for k=1:nchans
0310         x(k*20+(1:a),k)=20*randn(a,1);
0311         w(k*20+(1:a),k)=0;
0312     end
0313     [ww,yy]=nt_outliers(x,[],1);
0314     figure(1); clf
0315     subplot 311; plot(x); title('raw');  
0316     subplot 312; plot(yy); title('interpolated'); subplot 313; nt_imagescc(ww'); title('weights');
0317 end
0318 if 0
0319     N=10;
0320     nchans=20;
0321     nsamples=1100;
0322     x=zeros(nsamples,N);
0323     for k=1:N
0324         x(:,k)=sin(2*pi*k*(1:nsamples)'/nsamples);
0325     end
0326     x=x*randn(N,nchans);
0327 %    x=x+1*randn(size(x)); % add noise
0328     w=ones(size(x));
0329     x0=x;
0330     for k=1:nchans
0331         x(500+k*20+(1:40),k)=10;
0332         w(500+k*20+(1:40),k)=0;
0333     end
0334     [ww,y]=nt_outliers(x,[],1);
0335     figure(1); clf;
0336     subplot 412; plot(x); title ('raw');
0337     subplot 413; plot (y); title ('interpolated'); 
0338     subplot 414; nt_imagescc(ww'); title ('weights');
0339 end
0340 if 0
0341     [p,x]=nt_read_data('/data/meg/theoldmanandthesea/eeg/mc/MC_aespa_speech_45.mat'); x=x'; x=x(:,1:128); x=x(0000+(1:6000),:);
0342     %[p,x]=nt_read_data('/data/meg/arzounian/ADC_DA_140521_p20/ADC_DA_140521_p20_01_calib'); x=x'; x=x(1:10000,:);
0343     
0344     x=nt_demean(x);
0345     [x,w]=nt_detrend(x,3,[],[],3);   
0346     profile on; y=nt_inpaint(x,w); profile report;
0347     figure(1); clf
0348     subplot 311; plot(x); title('raw');  subplot 312; plot(y); title('clean'); subplot 313; plot(x-y); title('raw-clean');
0349     figure(2); clf
0350     ch=35;subplot 311; plot([x(:,ch),y(:,ch)]); subplot 312; plot(x(:,ch)-y(:,ch)); subplot 313; plot(w(:,ch), '.-');
0351     [ww,yy]=nt_outliers(x,[],1); drawnow;
0352     figure(4); clf;
0353     subplot 311; plot(x); 
0354     subplot 312; plot(yy);
0355     subplot 313; nt_imagescc(ww');
0356 end
0357 
0358

Generated on Thu 30-Nov-2017 17:26:18 by m2html © 2005