function [Y,P] = msnorm(MZ,Y,varargin)
%MSNORM normalize a set of mass spectra
%
%   YOUT = MSNORM(MZ,Y) normalizes a group of mass spectra by standardizing
%   the area under the curve (AUC) to the group median. MZ is the mass- 
%   charge column vector with the same size as number of rows in Y. Y is a
%   matrix with column-wise spectrograms, all sharing the same MZ scale.  
%
%   MSNORM(...,'QUANTILE',QV) sets a 1-by-2 vector with the quantile
%   limits to select a reduced set of MZ positions. Default is [0 1], i.e.
%   it uses the whole AUC. For example, when QV = [0.9 1], only the largest
%   10% of ion intensities in every spectrogram are used to compute the
%   AUC. When QV is a scalar it represents the lower quantile limit and the
%   upper quantile limit is set to 1.
%
%   MSNORM(...,'LIMITS',LIM) sets a 1-by-2 vector with the valid MZ range
%   to pick points for normalization. This parameter is useful to eliminate
%   low-mass noise from the AUC calculation.
%
%   MSNORM(...,'CONSENSUS',CON). MZ positions are selected with a consensus
%   rule; to include a MZ position into the AUC its intensity must be
%   within the quantile limits of at least a CON proportion of the
%   spectrograms in Y. The same MZ positions are used to normalize all
%   the spectrograms. CON is a scalar between 0 and 1. 
%
%   MSNORM(...,'METHOD',METHOD) sets the target to which the AUC of every
%   spectrogram is normalized. METHOD can be 'Median' (default) or 'Mean'. 
%
%   MSNORM(...,'MAX',MAX). After individually normalizing every spectrogram,
%   they are further scaled to adjust the overall maximum intensity to MAX.
%   MAX is a scalar. If it is omitted, no postscaling is performed. 
%
%   MSNORM(MZ,Y,P) employs the information in P to normalize a new set of
%   spectra using the same parameters to select the MZ positions and output
%   scale as a previous normalization. P is a structure created by MSNORM.
%   If a consensus proportion was given in the previous normalization, no
%   new MZ positions are selected; normalization is performed using the
%   the same MZ positions. 
%
%   [YOUT,P] = MSNORM(...) returns a structure containing the necessary
%   parameters to normalize another set of spectra.
%
%   Examples: 
%   
%      load sample_lo_res
%      Y = Y_lo_res(:,[1 2 5 6]);
%      MZ = MZ_lo_res;
%
%      % Normalize the AUC of every spectrogram to its median.
%      Y1 = msnorm(MZ,Y);
%
%      % Normalize the AUC of every spectrogram to its median, eliminating
%      % low-mass noise, and postscaling such that the maximum intensity
%      % is 50.
%
%      Y2 = msnorm(MZ,Y,'LIMITS',[1000 inf],'MAX',50);
%
%      % Normalize the ion intensities of all spectra to the maximum
%      % intesity of the single highest peak from any of the spectra in the
%      % range above 1000 MZ.
%
%      Y3 = msnorm(MZ,Y,'QUANTILE',[1 1],'LIMITS',[1000 inf]);
%
%      % Select MZ regions whose intensities are within the third quartile
%      % in at least 90% of the spectrograms.
%
%      [Y4,S] = msnorm(MZ,Y,'QUANTILE',[0.5 0.75],'CONSENSUS',0.9);
%
%      % Use the same MZ regions to normalize another set of spectrograms.
%
%      Y5 = msnorm(MZ,Y,'PARAMETERS',S);
%
%   See also MSALIGN, MSBACKADJ, MSHEATMAP, MSLOWESS, MSRESAMPLE, MSSGOLAY,
%   MSVIEWER. 

%   Copyright 2003-2004 The MathWorks, Inc.
%   $Revision: 1.1.10.1 $  $Date: 2004/12/24 20:43:28 $

% References:
% [1] M. Wagner, D. Nalk, and A. Pothen, "Protocols for disease
%     classification from mass spectrometry data" Proteomics 3,pp.1692-1698
%     (2003).
% [2] G. Satten, et al in "Standardization and denoising algorithms for
%     mass spectra to classify whole-organism bacterial specimens"
%     Bioinformatics 2004 Jun 24 (Epub ahead of print)
% [3] Leping Li, David M. Umbach, Paul Terry and Jack A. Taylor,
%     "Application of the GA/KNN method to SELDI proteomics data" PNAS 2003.
% [4] Lilien et al in "Probabilistic Disease Classification of
%     Expression-Dependent Proteomic Data from Mass Spectrometry of Human
%     Serum", Journal of Computational Biology, 10(6) 2003, pp. 925-946.

% set defaults
parametersGiven = false;
outputScaleGiven = false;
consensusRateGiven = false;
methodIsMedian = true;
quantileValues = [0 1];
consensusRate = 0.75;
mzLimits = [0 inf];

% validate MZ and Y
if ~isnumeric(Y) || ~isreal(Y)
   error('Bioinfo:msnorm:IntensityNotNumericAndReal',...
         'Ion intensities must be numeric and real.') 
end

if ~isnumeric(MZ) || ~isreal(MZ) || ~isvector(MZ)
   error('Bioinfo:msnorm:MZNotNumericAndReal',...
         'MZ scale must be a numeric and real vector.') 
end

if size(MZ,1) ~= size(Y,1)
   error('Bioinfo:msnorm:NotEqualNumberOfSamples',...
         ['The ion intensity vector(s) must have the same number'...
          '\nof samples as the MZ scale.'])
end

[numSamples, numSpectrograms] = size(Y);

nvarargin =  numel(varargin); 
% check if third input is the structure with parameters
if nvarargin == 1 && isstruct(varargin{1})
    P = varargin{1};
    if ~isfield(P,'targetedSumI') || ~isfield(P,'mzh')
            error('Bioinfo:msnorm:NoMsnormStruct',...
                  'The parameters should be a struct generated by MSNORM.');
    end
    if ~isnan(P.mzh) % it is a normalization with consensus
        if numel(P.mzh)~=numSamples || ~islogical(P.mzh)
            error('Bioinfo:msnorm:NoMsnormStruct',...
                  ['The current MZ vector is not the same as the\n'...
                   'one used to generate the structure of parameters.']);
        end
    end
    parametersGiven = true;
else
    if nvarargin
        if rem(nvarargin,2)
            error('Bioinfo:msnorm:IncorrectNumberOfArguments',...
                'Incorrect number of arguments to %s.',mfilename);
        end
        okargs = {'quantile','consensusrate','limits','max','method'};
        for j=1:2:nvarargin
            pname = varargin{j};
            pval = varargin{j+1};
            if ~ischar(pname)
                error('Bioinfo:msnorm:ParameterNameNochar',...
                    'Parameter name must be a string.');
            end
            k = strmatch(lower(pname), okargs);  %#ok
            if isempty(k)
                error('Bioinfo:msnorm:UnknownParameterName',...
                    'Unknown parameter name: %s.',pname);
            elseif length(k)>1
                error('Bioinfo:msnorm:AmbiguousParameterName',...
                    'Ambiguous parameter name: %s.',pname);
            else
                switch(k)
                    case 1 % 'quantile'
                        if isscalar(pval)
                            if pval<0 || pval>1
                                error('Bioinfo:msnorm:badScalarQuantile',...
                                    ['Quantile limits must be a [1x2] vector with non decreasing\n'...
                                    'values between 0 and 1, or a scalar between 0 and 1.'])
                            else
                                quantileValues = [pval 1];
                            end
                        else
                            if numel(pval)~=2 || diff(pval)<0
                                error('Bioinfo:msnorm:badVectorQuantile',...
                                    ['Quantile limits must be a [1x2] vector with non decreasing\n'...
                                    'values between 0 and 1, or a scalar between 0 and 1.'])
                            else
                                quantileValues = [pval(1) pval(2)];
                            end
                        end
                    case 2 % 'consensusrate'
                        if ~isscalar(pval) || pval<0 || pval>1
                            error('Bioinfo:msnorm:badConsensus',...
                                'Consensus rate must be a scalar between 0 and 1.')
                        end
                        consensusRate = pval;
                        consensusRateGiven = true;
                    case 3 % 'limits'
                        if numel(pval)~=2 || diff(pval)<=0
                            error('Bioinfo:msnorm:badRange',...
                                'The limits must be provided in the form [MZmin, MZmax]')
                        end
                        mzLimits = [max(0,max(pval(1),MZ(1))),min(pval(2),MZ(end))];
                    case 4 % 'max'
                        if ~isscalar(pval)
                            error('Bioinfo:msnorm:badScale',...
                                'Maximum value of output range must be a scalar.')
                        end
                        outputScaleGiven = true;
                        outputScale = pval;
                    case 5 % 'method'
                        if ischar(pval)
                            okmethods = {'mean','median'};
                            nm = strmatch(lower(pval), okmethods);%#ok
                            if isempty(nm)
                                error('Bioinfo:msnorm:MethodNameNotValid',...
                                      'METHOD must be ''mean'' or ''median''.');
                            elseif length(nm)>1
                                error('Bioinfo:msnorm:AmbiguousMethodName',...
                                      'Ambiguous normalization method: %s.',pval);
                            else
                                methodIsMedian = nm == 2;
                            end
                        else
                             error('Bioinfo:msnorm:MethodNameNotValid',...
                                'METHOD must be ''mean'' or ''median''.');
                        end
                end
            end
        end
    end
end

if ~parametersGiven

    % search for the selected MZ values
   
    mzl = MZ>=mzLimits(1) & MZ<=mzLimits(2);
    Q = quantile(Y(mzl,:),quantileValues);
    if numSpectrograms == 1
        Q = Q';
    end

    if consensusRateGiven
        msCount = zeros(numSamples,1);
        for k = 1:numSpectrograms
            t = Y(:,k)>=Q(1,k) & Y(:,k)<=Q(2,k) & mzl;
            if ~sum(t)
                [dummy,h] = min(abs(Y(:,k)-Q(1,k)) + inf.*~mzl);  %#ok
                msCount(h) = msCount(h)+1;
            else
                msCount = msCount+t;
            end
        end % for k = ...
        mzh = msCount/numSpectrograms >= consensusRate;
        if ~sum(mzh)
            error('Bioinfo:msnorm:SelectedMZisEmpty',...
                ['The set of selected MZ positions is empty, either increase\n'...
                'the quantile limits or decrease the consensus rate.'])
        end
        % compute the sum of the intensities of selected points
        sumI = sum(Y(mzh,:));
    else %~consensusRateGiven
        sumI = zeros(numSpectrograms,1);
        for k = 1:numSpectrograms
            t = Y(:,k)>=Q(1,k) & Y(:,k)<=Q(2,k) & mzl;
            if ~sum(t)
                [dummy,h] = min(abs(Y(:,k)-Q(1,k)) + inf.*~mzl);  %#ok
                sumI(k) = Y(h,k);
            else
                sumI(k) = sum(Y(t,k));
            end
        end
        mzh = NaN; %will indicate it is a normalization does not use consensus
    end

    if methodIsMedian
        medSumI = median(sumI);
    else
        medSumI = mean(sumI);
    end

    % scale every spectrogram
    for k = 1:numSpectrograms
        Y(:,k) = Y(:,k) * (medSumI/sumI(k));
    end
 
    if outputScaleGiven
        K = outputScale/max(max(Y(mzl,:)));
        Y = Y * K;
        targetedSumI = K * medSumI;
    else
        targetedSumI = medSumI;
    end
    if nargout > 1
        P.mzh = mzh;
        P.targetedSumI = targetedSumI;
        if ~consensusRateGiven
            P.mzLimits = mzLimits;
            P.quantileValues = quantileValues;
        end
    end
else % parametersGiven
    if isnan(P.mzh) % it is a normalization without consensus
        mzl = MZ>=P.mzLimits(1) & MZ<=P.mzLimits(2);
        Q = quantile(Y(mzl,:),P.quantileValues);
        if numSpectrograms == 1
            Q = Q';
        end
        sumI = zeros(numSpectrograms,1);
        for k = 1:numSpectrograms
            t = Y(:,k)>=Q(1,k) & Y(:,k)<=Q(2,k) & mzl;
            if ~sum(t)
                [dummy,h] = min(abs(Y(:,k)-Q(1,k)) + inf.*~mzl);  %#ok
                sumI(k) = Y(h,k);
            else
                sumI(k) = sum(Y(t,k));
            end
        end
    else % it is a normalization with consensus, mz positions already chosen
        sumI = sum(Y(P.mzh,:));
    end
    % scale every spectrogram
    for k = 1:numSpectrograms
        Y(:,k) = Y(:,k) * (P.targetedSumI/sumI(k));
    end
end
