Home > NoiseTools > nt_outliers.m

nt_outliers

PURPOSE ^

[w,y]=nt_outliers(x,w,thresh,niter) - detect outliers based on weighted correlation structure

SYNOPSIS ^

function [w,y]=nt_outliers(x,w,thresh,niter)

DESCRIPTION ^

[w,y]=nt_outliers(x,w,thresh,niter) - detect outliers based on weighted correlation structure

  w: weights (indicates outliers)
  y: interpolated data

  x: data matrix
  w: initial weights
  thresh: threshold for declaring an outlier [default: 2]
  niter: number of iterations [default: 3]


 Noisetools.

CROSS-REFERENCE INFORMATION ^

This function calls: This function is called by:

SUBFUNCTIONS ^

SOURCE CODE ^

0001  function [w,y]=nt_outliers(x,w,thresh,niter)
0002 %[w,y]=nt_outliers(x,w,thresh,niter) - detect outliers based on weighted correlation structure
0003 %
0004 %  w: weights (indicates outliers)
0005 %  y: interpolated data
0006 %
0007 %  x: data matrix
0008 %  w: initial weights
0009 %  thresh: threshold for declaring an outlier [default: 2]
0010 %  niter: number of iterations [default: 3]
0011 %
0012 %
0013 % Noisetools.
0014 nt_greetings;
0015 
0016 PCA_THRESH=10^-15;
0017 nClosest=min(10,size(x,2)-1); % limit the number of neighbors to consider
0018 %nClosest=40;
0019 
0020 if nargin<1; error('!'); end
0021 if nargin<2||isempty(w); w=ones(size(x)); end
0022 if nargin<3||isempty(thresh); thresh=2; end
0023 if nargin<4||isempty(niter); niter=4; end
0024 if ndims(x)>2; error('!'); end
0025 if ~all(size(x)==size(w)); error('!'); end
0026 [nsamples,nchan]=size(x);
0027 
0028 
0029 %{
0030 On each iteration we reconstruct each channels based on other channels and
0031 then flag the parts that mismatch as corrupt.
0032 %}
0033 x0=x;
0034 
0035 figure(200); clf
0036 for iIter=1:niter
0037     
0038     %{
0039     We have multichannel data x and a multichannel weighting function w (0 or
0040     1). There are many configurations of valid/invalid channels to consider.
0041     List them.
0042     %}
0043     
0044     [ww,nOccurrences,iBack]=patternDict(w); % all patterns of good/bad channels
0045     nPatterns=size(ww,1);
0046     %disp('npatterns'); disp(nPatterns)
0047     %{
0048     Now we have a list of all the different weight patterns: ww. The
0049     vector iBack indicates which data samples fit each pattern: w = ww(iBack,:).
0050     %}
0051 
0052     %{
0053     Find which channels are 'neighbors' in terms of covariance.
0054     %}
0055 
0056     % weighted covariance matrix to determine which channels are close
0057     [x,save_mean]=nt_demean(x0,w); 
0058     [x,save_amp]=nt_normcol(x,w);
0059     xx=x.*w;
0060     c=(xx'*xx) ./ (w'*w); clear xx;
0061     c=abs(c); 
0062     sims=c+10*eye(size(c)); % make sure self always scores highest so we can skip it
0063 
0064     y=x;  
0065 
0066     %{
0067     We now have a matrix indicating proximity between channels. 
0068     %}
0069 
0070     %{
0071     For each channel, we calculate the projection matrix on the the subspace spanned 
0072     by other *valid* channels.  There are as many projection matrices as patterns 
0073     of valid/invalid channels.  Each projection matrix is estimated on data samples for
0074     which iChan is valid, and can be used to reconstruct data samples for which it is 
0075     invalid.
0076     %}
0077 
0078     for iChan=1:nchan
0079 
0080         %{
0081         We want to avoid having to consider all patterns of valid/unvalid 
0082         other channels. For that we'll group patterns. 
0083         First we order the other channels by decreasing similarity, putting
0084         invalid samples last. This needs to be done for each pattern.
0085         %}
0086 
0087         sim=sims(iChan,:);              % list of similarities with all other channels
0088         sim=repmat(sim,nPatterns,1);    % replicate so each pattern has own list
0089         sim((~ww))=0;                   % for each list, give bad channels a low score
0090         [~,closest]=sort(abs(sim),2,'descend');     % sort each list by decreasing similarity
0091         for k=1:size(closest,1);
0092             closest(k,find(sim(k,closest(k,:))==0))=0;     % mark bad channels as 0
0093         end
0094         for k=1:size(closest,1);
0095             if closest(k,1)==iChan; 
0096                 closest(k,1:end-1)=closest(k,2:end);
0097              else
0098                 % iChan was bad so not first
0099             end
0100         end
0101         closest=closest(:,1:end-1);     % last not valid if first skipped
0102 
0103         %{
0104         We now have, for each pattern, a list of channels closest to iChan. 
0105         There are a lot of different patterns, so we merge those for which the nClosest 
0106         channels are the same.
0107         %}
0108 
0109         % group patterns for which the nClosest most similar channels are the same
0110         [C,IA,IC]=unique(closest(:,1:nClosest),'rows');
0111         iBack2=IC(iBack);       % maps each pattern to the data that fit it
0112 
0113         %{
0114         We now have a smaller array C of reduced patterns. The
0115         vector iBack2 indicates which data samples correspond to each pattern.
0116         %}
0117 
0118         %{
0119         For some patterns, only a few samples are invalid. Skip these
0120         and iterpolate later using serial interpolation.
0121         %}
0122         
0123 
0124         toFix=[];
0125         NSKIP=4; % skip if fewer samples
0126         www=ones(size(x,1),1);
0127         for iPattern=1:size(C,1)
0128             mySamples=find(iBack2==iPattern); 
0129             if numel(mySamples)<=NSKIP
0130                 www(mySamples)=0;
0131             else
0132                 toFix=[toFix,iPattern];
0133             end
0134         end
0135         C=C(toFix,:);
0136 
0137         %disp(size(C,1))
0138 
0139         for iPattern=1:size(C,1)
0140 
0141             %{
0142             Estimate matrix to project iChan on the other channels listed in this
0143             pattern. 
0144             %}
0145 
0146             %disp([iChan iPattern])
0147             oChan=C(iPattern,:);
0148             %disp(oChan)
0149             oChan(find(oChan==0))=[]; % exclude bad channels
0150             
0151             if any(iChan==oChan); error('!'); end
0152 
0153             % samples corresponding to this pattern
0154             mySamples=find(iBack2==toFix(iPattern)); 
0155 
0156             % select data for which iChan *and* oChan are valid
0157             iBothValid=all(w(:,[iChan,oChan]),2);        
0158             xxx=x(iBothValid, [iChan,oChan]);  
0159             %figure(3); clf; plot (iBothValid); title([iChan oChan]);
0160 
0161             %%% --> we should be able to avoid this situation
0162             if isempty(xxx); 
0163                 disp([iChan, iPattern]); disp('empty'); 
0164                 continue; % we won't estimate or fix anything
0165             end
0166 
0167             % calculate covariance matrix
0168             mn=mean(xxx,1);
0169             xxx=nt_demean(xxx); % remove mean first
0170             ccc=xxx'*xxx;
0171 
0172             % PCA other channels to remove weak dimensions
0173             [topcs,eigenvalues]=nt_pcarot(ccc(2:end,2:end));
0174             idx=find(eigenvalues/max(eigenvalues) > PCA_THRESH); % discard weak dims
0175             topcs=topcs(:,idx);
0176 
0177             % projection matrix
0178             b=ccc(1,2:end)*topcs / (topcs'*ccc(2:end,2:end)*topcs);  
0179             
0180             %{
0181             We now have the projection matrix to project channel iChan on channels oChan,
0182             applicable to samples corresponding to this pattern.  We can use it
0183             to fix samples for which iChan is invalid.
0184             %}
0185             
0186             y(mySamples,iChan) = ...
0187                (x(mySamples,oChan) - repmat(mn(2:end),numel(mySamples),1))... % first remove mean of other chans...
0188                 *(topcs*b') ...
0189                 + mn(1); % ... then restore mean of this channel
0190             
0191         end
0192 
0193         %{
0194         Now we fix the isolated samples that we skipped, using serial interpolation.
0195         %}
0196 
0197         MAXGAPSIZE=100;
0198         y(:,iChan)=fillgap(y(:,iChan),www,MAXGAPSIZE);
0199 
0200     end
0201 
0202 
0203     y=bsxfun(@times,y,save_amp); % restore the initial amplitude
0204     y=bsxfun(@plus,y,save_mean); % restore the initial mean
0205     
0206     v=min(std(x0(find(w))),std(y(find(w))));
0207     d = abs(y-x0); 
0208     w=double( (d/v < thresh) );
0209     score=mean( (d.*w)) ./ mean(w);
0210     disp([num2str(iIter), ', score: ',num2str(sum(score))]);
0211     
0212     ch=33; FOCUS=1:size(x,1);
0213     figure(200);
0214     %plot(x0(FOCUS,ch), 'k'); hold on
0215     %plot(w(FOCUS,ch) .* (y(FOCUS,ch)-x0(FOCUS,ch)));  hold on; drawnow
0216     plot(score); hold on; drawnow
0217     
0218 end % iterations
0219 
0220 
0221 if ~nargout
0222     % plot, don't return values
0223     disp(nt_wpwr(y)/nt_wpwr(x));
0224     figure(11); clf;
0225     subplot 311; plot(x0); title('raw'); xlim([1 size(x0,1)]);
0226     subplot 312; plot(y); title('projected on other channels'); xlim([1 size(x0,1)]);
0227     subplot 313; nt_imagescc(w'); title('weight');
0228     clear w
0229 end
0230 
0231 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0232 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
0233 function [y,w]=fillgap(x,w,maxGapSize)
0234 %y=fillgap(x,w,maxGapSize) - fill gaps using simple interpolation
0235 %
0236 %  y: interpolated data
0237 %
0238 %  x: data to interpolate
0239 %  w: weighting function
0240 %  maxGapSize: largest expected gap size
0241 if nargin<2; error('!'); end
0242 if nargin<3||isempty(maxGapSize); maxGapSize=1; end
0243 if size(x,2)>1; error('!'); end
0244 if size(x) ~= size(w); error('!'); end
0245 y=x;
0246 if all(w); return; end
0247 % simple case size one
0248 iToFix=1+find(~w(2:end-1)&w(1:end-2)&w(3:end)); 
0249 y(iToFix)=(y(iToFix-1)+y(iToFix+1))/2;
0250 w(iToFix)=1; 
0251 % general case size > 1
0252 iStart=find(w(1:end-2) & ~w(2:end-1));  % position preceding gap
0253 iStop=find(~w(1:end-1) & w(2:end));     % last position in gap
0254 if isempty(iStart)||isempty(iStop); return; end
0255 if iStop(1)<iStart(1);
0256     iStop=iStop(2:end);                 % ignore gap at beginning
0257 end
0258 iStart=iStart(1:numel(iStop));          % ignore gap at end
0259 for gapSize=2:maxGapSize
0260     idx=find(iStop-iStart==gapSize);
0261     for k=1:gapSize
0262         % interpolate between samples on either side of gap
0263         y(iStart(idx)+k) = ( y(iStart(idx)) * (gapSize-k+1) + y(iStart(idx)+gapSize+1) * k ) / (gapSize+1);
0264         w(iStart(idx)+k) = 1;
0265     end
0266 end
0267 
0268 
0269 
0270 % create a dictionary of weight patterns
0271 function [ww,nOccurrences,iBack]=patternDict(w)
0272 % ww: dictionary of patterns
0273 % nOccurrences: number of times each pattern occurred
0274 % iBack: index to reconstruct input from dictionary
0275 [ww,~,IC,nOccurrences]=nt_unique(w,'rows');
0276 [nOccurrences,iSort]=sort(nOccurrences, 'descend'); % sort by decreasing number
0277 [~,iReverse]=sort(iSort); %
0278 ww=ww(iSort,:); % same order for patterns, w = ww(iReverse1(IC),:)
0279 iBack=iReverse(IC); % w = ww(iBack,:)
0280 
0281 %%% TEST %%%
0282 if 0
0283     nsources=3;
0284     x0=sin(2*pi*(1:10000)'*(1:nsources)/10000);
0285     x=x0*nt_normcol(randn(nsources,6));
0286     w=ones(size(x));
0287     x(1:1000,1)=10; w(1:1000,1)=0;
0288     x(2001:3000,1)=10; w(2001:3000,1)=0;
0289     x(1:2000,2)=10; w(1:2000,2)=0;
0290     %x=x+0.1*randn(size(x));
0291     %y=nt_inpaint(x,w);
0292     [ww,yy]=nt_outliers(x,[],1);
0293     figure(1); clf
0294     subplot 311; plot(x); title('raw');  subplot 312; plot(yy); title('interpolated'); subplot 313; nt_imagescc(w'); title('weights');
0295 end
0296 if 0
0297     N=20;
0298     nchans=50;
0299     x=zeros(1100,N);
0300     for k=1:N
0301         x(:,k)=sin(2*pi*k*(1:1100)'/1100);
0302     end
0303     x=x*randn(N,nchans);
0304     x=nt_normcol(x) + 0*randn(size(x));
0305     w=ones(size(x));
0306     x0=x;
0307     a=30
0308     for k=1:nchans
0309         x(k*20+(1:a),k)=20*randn(a,1);
0310         w(k*20+(1:a),k)=0;
0311     end
0312     [ww,yy]=nt_outliers(x,[],1);
0313     figure(1); clf
0314     subplot 311; plot(x); title('raw');  
0315     subplot 312; plot(yy); title('interpolated'); subplot 313; nt_imagescc(ww'); title('weights');
0316 end
0317 if 0
0318     N=10;
0319     nchans=20;
0320     nsamples=1100;
0321     x=zeros(nsamples,N);
0322     for k=1:N
0323         x(:,k)=sin(2*pi*k*(1:nsamples)'/nsamples);
0324     end
0325     x=x*randn(N,nchans);
0326 %    x=x+1*randn(size(x)); % add noise
0327     w=ones(size(x));
0328     x0=x;
0329     for k=1:nchans
0330         x(500+k*20+(1:40),k)=10;
0331         w(500+k*20+(1:40),k)=0;
0332     end
0333     [ww,y]=nt_outliers(x,[],1);
0334     figure(1); clf;
0335     subplot 412; plot(x); title ('raw');
0336     subplot 413; plot (y); title ('interpolated'); 
0337     subplot 414; nt_imagescc(ww'); title ('weights');
0338 end
0339 if 0
0340     [p,x]=nt_read_data('/data/meg/theoldmanandthesea/eeg/mc/MC_aespa_speech_45.mat'); x=x'; x=x(:,1:128); x=x(0000+(1:6000),:);
0341     %[p,x]=nt_read_data('/data/meg/arzounian/ADC_DA_140521_p20/ADC_DA_140521_p20_01_calib'); x=x'; x=x(1:10000,:);
0342     
0343     x=nt_demean(x);
0344     [x,w]=nt_detrend(x,3,[],[],3);   
0345     profile on; y=nt_inpaint(x,w); profile report;
0346     figure(1); clf
0347     subplot 311; plot(x); title('raw');  subplot 312; plot(y); title('clean'); subplot 313; plot(x-y); title('raw-clean');
0348     figure(2); clf
0349     ch=35;subplot 311; plot([x(:,ch),y(:,ch)]); subplot 312; plot(x(:,ch)-y(:,ch)); subplot 313; plot(w(:,ch), '.-');
0350     [ww,yy]=nt_outliers(x,[],1); drawnow;
0351     figure(4); clf;
0352     subplot 311; plot(x); 
0353     subplot 312; plot(yy);
0354     subplot 313; nt_imagescc(ww');
0355 end
0356 
0357

Generated on Sat 29-Apr-2023 17:15:46 by m2html © 2005