function [outputStruct,vals]=geosoftread(geotext)
%GEOSOFTREAD reads in Gene Expression Omnibus SOFT format data files.
%
%   GEODATA = GEOSOFTREAD(FILE) reads in Gene Expression Omnibus (GEO) SOFT
%   format data from FILE and creates a structure GEODATA, containing the
%   following fields:
%       Scope
%       Accession
%       Header
%       ColumnDescriptions
%       ColumnNames
%       Data
%
%   [GEODATA, VALS] = GEOSOFTREAD(FILE) also returns the parsed data values
%   (the same array that is stored in GEODATA.Data). VALS is a numeric
%   matrix when every column is numeric and a cell array otherwise. VALS is
%   empty if the data section could not be read.
%
%   FILE can also be a URL or a MATLAB character array that contains the
%   text of a GEO SOFT format file.
%
%   Sample (GSM) and Database (GDS) records are supported. Series (GSE)
%   records and unrecognized record types raise an error. If the record
%   cannot be parsed completely a warning is issued and GEODATA contains
%   only the fields that were read before the problem occurred.
%
%   Example:
%
%       % Get a file from GEO and save it to a file.
%       geodata = getgeodata('GSM3258','TOFILE','GSM3258.txt')
%
%       % In subsequent MATLAB sessions you can use geosoftread to access the
%       % local copy from disk instead of accessing it from the GEO web site.
%       geodata = geosoftread('GSM3258.txt')
%
%   See also GALREAD, GETGEODATA, GPRREAD, SPTREAD.

% Copyright 2003-2004 The MathWorks, Inc.
% $Revision: 1.8.6.4 $   $Date: 2004/12/24 20:44:15 $

if ~ischar(geotext) && ~iscellstr(geotext)
    error('Bioinfo:geosoftread:InvalidStringInput',...
        'The input is not an array of characters or a cell array of strings.');
end

% Make sure the second output is always defined, even when parsing fails
% part way through and only a warning is issued.
vals = [];

% If the input is a string of GEO SOFT data then first character should be ^
isAstring = ischar(geotext) && ~isempty(geotext) && geotext(1) == '^';

if ~isAstring
    if ~iscellstr(geotext)
        if ~isempty(strfind(geotext, '://'))
            if (~usejava('jvm'))
                error('Bioinfo:geosoftread:NoJava','Reading from a URL requires Java.')
            end
            % must be a URL
            geotext = urlread(geotext);
            % clean up any &amp s
            geotext = strrep(geotext,'&amp;','&');
            % urlread returns one long string; the parser below works on a
            % cell array with one line per cell.
            geotext = strread(geotext,'%s','delimiter','\n');
        else
            if exist(geotext,'file')
                % it is a file
                geotext = textread(geotext,'%s','delimiter','\n','whitespace','');
            else
                % Anything that is not a readable file is reported here,
                % rather than falling through with unparsed text.
                error('Bioinfo:geosoftread:FileNotExist','The file %s does not exist.',geotext)
            end
        end
    else
        error('Bioinfo:geosoftread:InvalidSOFTFile','FILE is not a valid SOFT file.')
    end
else
    geotext = strread(geotext,'%s','delimiter','\n');
end

% Blank lines carry no information; drop them before counting lines.
emptyLines = cellfun('isempty',geotext);
geotext(emptyLines) = [];
numLines = size(geotext,1);

% format as follows
% ^ start line with ID
% ! comment lines
% # column descriptions
% data values

colHeaderInfo = strncmp(geotext,'#',1);
firstColHeader = find(colHeaderInfo,1);
startLn = max(firstColHeader -1,1);

% The record type is the text between the leading '^' and the first '='.
GEOType = lower(strtok(geotext{1}(2:end),'='));

switch GEOType
    case {'sample','database'}
        isSample = strcmp(GEOType,'sample');
        try
            % Locate the line that carries the scope and accession number:
            % the first line for a sample, the first '^dataset' line for a
            % database record.
            if isSample
                idLine = geotext{1};
            else
                dsLines = strmatch('^dataset',geotext);%#ok
                idLine = geotext{dsLines(1)};
            end
            [outputStruct.Scope, outputStruct.Accession] = strtok(idLine,'=');
            % Drop the leading '^' from the scope and '=' from the accession.
            outputStruct.Scope = strtrim(outputStruct.Scope(2:end));
            outputStruct.Accession = strtrim(outputStruct.Accession(2:end));
            ln = startLn+1;

            % Strip the '!' marker from any comment lines found at ln.
            while geotext{ln}(1) == '!'
                geotext{ln}(1) = '';
                ln = ln+1;
            end
            outputStruct.Header.Type ='Gene Expression Omnibus';
            outputStruct.Header.Text = char(geotext(2:ln-1));

            % Column description lines start with '#'.
            colStart = ln;
            while geotext{ln}(1) == '#'
                geotext{ln}(1) = '';
                ln = ln+1;
            end
            outputStruct.ColumnDescriptions = geotext(colStart:ln-1);

            % The line after the descriptions holds the tab-separated
            % column names; everything after it is data.
            outputStruct.ColumnNames = strread(geotext{ln},'%s','delimiter','\t');
            numCols = numel(outputStruct.ColumnDescriptions);
            numRows = numLines-ln;

            % Decide which columns are numeric.
            isNumeric = true(1,numCols);
            if isSample
                % Sample records: inspect the first data row only.
                geotext{ln+1} = sanitizeLine(geotext{ln+1});
                splitLine = strread(geotext{ln+1},'%s','delimiter','\t');
                for count = 1:numCols
                    hasNonNumericChar = ~isempty(regexp(splitLine{count},'[^0-9.eE\-]','once'));
                    if isempty(splitLine{count}) || ...
                            (hasNonNumericChar && ~strcmp(splitLine{count},'NaN'))
                        isNumeric(count) = false;
                    end
                end
            else
                % Database records: a column is numeric only if every row
                % starts with something sscanf can read as a number.
                for theRow = 1:numRows
                    geotext{ln+theRow} = sanitizeLine(geotext{ln+theRow});
                    splitLine = strread(geotext{ln+theRow},'%s','delimiter','\t');
                    for count = 1:numCols
                        if isNumeric(count) && (isempty(sscanf(splitLine{count},'%f')))
                            isNumeric(count) = false;
                        end
                    end
                end
            end

            [vals,badDataWarning] = readDataLines(geotext(ln+1:ln+numRows),numCols,isNumeric);
            if badDataWarning
                warning('Bioinfo:geosoftread:BadGEOData',...
                    'Unable to read some lines of the file. Missing entries will be replaced with NaNs.');
            end
            outputStruct.Data = vals;
        catch
            % Best effort: keep whatever fields were filled in so far.
            warning('Bioinfo:geosoftread:IncompleteGEOFile','Problems reading the GEO data. The structure may be incomplete.');
        end
    case 'series'
        error('Bioinfo:geosoftread:SeriesNotSupported',...
            'GEOSOFTREAD does not currently support Series (GSE) records.');
    otherwise
        error('Bioinfo:geosoftread:UnknownGEOFormat',...
            'Unknown GEO format.\nGEOSOFTREAD supports Sample (GSM) and Database (GDS) records.');
end

%--------------------------------------------------------------------------
function textLine = sanitizeLine(textLine)
%SANITIZELINE replaces tokens in a data line that cannot be read as numbers.
%   'Error', 'error' and 'null' become 'NaN', and a lone '-' immediately
%   before a tab becomes '-0'.

textLine = strrep(textLine,'Error','NaN');
textLine = strrep(textLine,'error','NaN');
textLine = strrep(textLine,'null','NaN');
textLine = strrep(textLine,sprintf('-\t'),sprintf('-0\t'));

%--------------------------------------------------------------------------
function [vals,badData] = readDataLines(dataLines,numCols,isNumeric)
%READDATALINES converts tab-delimited data lines into a table of values.
%   DATALINES is a cell array with one data line per cell, NUMCOLS is the
%   expected number of columns and ISNUMERIC is a 1-by-NUMCOLS logical
%   vector marking the numeric columns.
%
%   VALS is a numeric matrix when all columns are numeric, otherwise a cell
%   array in which text columns hold character arrays. BADDATA is true if
%   at least one line could not be read even after sanitizing; such rows
%   are filled with NaN.

numRows = numel(dataLines);
badData = false;

if all(isNumeric)
    vals = zeros(numRows,numCols);
    for theLine = 1:numRows
        try
            vals(theLine,:) = strread(dataLines{theLine},'%f','delimiter','\t','emptyvalue',NaN)';
        catch
            % Retry once after replacing the known non-numeric tokens.
            dataLines{theLine} = sanitizeLine(dataLines{theLine});
            try
                vals(theLine,:) = strread(dataLines{theLine},'%f','delimiter','\t','emptyvalue',NaN)';
            catch
                vals(theLine,:) = nan(1,numCols);
                badData = true;
            end
        end
    end
else
    vals = cell(numRows,numCols);
    % Build a format such as '%f%s%f': %f for numeric columns, %s for text.
    percents = repmat('%',1,numCols);
    Fs = repmat('s',1,numCols);
    Fs(isNumeric) = 'f';
    formatString = reshape([percents;Fs],1,2*numCols);
    charCols = find(~isNumeric);

    for theLine = 1:numRows
        if isempty(dataLines{theLine})
            continue
        end
        try
            [vals{theLine,:}] = strread(dataLines{theLine},formatString,'delimiter','\t','emptyvalue',NaN);
        catch
            % Retry once after replacing the known non-numeric tokens.
            dataLines{theLine} = sanitizeLine(dataLines{theLine});
            try
                [vals{theLine,:}] = strread(dataLines{theLine},formatString,'delimiter','\t','emptyvalue',NaN);
            catch
                vals(theLine,:) = repmat({NaN},1,numCols);
                badData = true;
            end
        end
        % Convert the entries of the text columns to plain character arrays.
        for col = charCols
            vals{theLine,col} = char(vals{theLine,col});
        end
    end
end
