#define MODULE_SOURCE "generate_garrahan"
/* generate_garrahan. 13 May 96. WJS */
/* Read ASCII-dumped version of P Garrahan bio-count spreadsheet */
/* data from MOC nets */
/* */
/* Input file is columnar with blanks between columns if */
/* necessary. Its lines logically fall into 2 sections: header & */
/* data. */
/* Each line in the data section consists of a species name, */
/* stage, and a count per net. Above each count, in the header */
/* section, is a series of lines describing the net. Above the */
/* species/stage columns, in the header section, is an ID for */
/* each of the header lines. This information is not used by */
/* generate_garrahan, except, possibly, to identify where the */
/* header section ends. */
/* It is assumed that one header line identifies the station */
/* containing the net. The information in this line and those */
/* above it are not output. The station line is identified as */
/* follows: the header lines above the counts are searched (case */
/* insensitively) for the directory containing the data file. */
/* If a match is found, the station line is the following line. */
/* If no match is found, the station line is the first non-blank */
/* line of the data file. Note that, logically, the station value */
/* (and the values in all header lines above it) are repeated over */
/* each count column. This is not checked, nor is the station */
/* value checked. */
/* There are other lines embedded in the data section. Blank */
/* lines (all spaces, possibly ending w/newline) are ignored, as */
/* are copies of the header section. "Title" or other ID lines */
/* are, unfortunately, treated as data */
/* */
/* Output is a defgb input file. */
/* When used the first time, generate_garrahan produces the */
/* top level of a 2-level structure, with variables from the */
/* header section of the data, along with an invocation of itself */
/* (with a -pass2 switch) to produce the next level.
*/
/* When called with the -pass2 switch, generate_garrahan */
/* produces a 1-level structure with the species and stage */
/* variables, and the column of counts corresponding to the */
/* appropriate header information. */
/* */
/* generate_garrahan requires 5 arguments and accepts one switch */
/* The first argument is the input file name with its directory */
/* tree. The directory containing the file may be named with */
/* the value of one of the header line variables-see above for */
/* the consequences of this. */
/* The second argument is a string that serves to uniquely iden- */
/* tify the last line of the header section */
/* The third argument is a comma separated list of the column */
/* widths. The sum of all widths must match or exceed the */
/* length of the significant data in the longest header line. */
/* There may be extra widths-the longest header line determines */
/* the last significant column. Data after the last column in */
/* subsequent data records will be ignored. */
/* The fourth argument is different depending on whether the */
/* switch is specified (see below). If the switch is not present, */
/* the argument is a comma-separated list of variable names */
/* corresponding to each header line variable. If the switch is */
/* present, the argument is a comma-separated list of which */
/* logical columns of data are to be output. (In normal use, */
/* the switch and the corresponding column list are produced by */
/* generate_garrahan itself, and need not be input by the user) */
/* The fifth argument is a comma-separated list of the variable */
/* names corresponding to the data column variables. The name of */
/* the variable corresponding to the repeated columns (count) must */
/* be present exactly once. */
/* The switch is the string -pass2. If present, it must precede */
/* the fourth argument.
This switch determines which level is to */
/* be output (see "Output" above) */

#define MODULE_VERSION "v 1.1 14 May 1996"
/* 14 May 96. v 1.1. Skip copies of the header section. WJS */

/************************************************************************/

/* NOTE(review): the three include directives below carry no header    */
/* name in this copy of the file; the names appear to have been lost.  */
/* The code below calls printf, fopen, fgets, fclose, sscanf, strlen,  */
/* strspn, strstr, strchr, strrchr, strtok, strcmp, toupper and exit.  */
/* Confirm the intended headers against a clean copy.                   */
#include
#include
#include

/* Boolean type used throughout this file */
typedef char Logical;
#define TRUE 1
#define FALSE 0

/* Indices into inp_arg[] for the positional (non-switch) arguments.   */
/* Index 3 has two names because its meaning depends on whether the    */
/* pass2 switch was given.                                              */
#define INP_FILE_NAME 0
#define LAST_HEAD_STRING_ARG 1
#define COL_WIDTHS_ARG 2
#define SELECTED_COLS_ARG 3 /* 2nd time through */
#define HEADER_LINE_VARNAME_ARG 3 /* 1st time through */
#define DATA_COL_VARNAME_ARG 4
/* The expansion of NARGS is not parenthesized; the uses visible in    */
/* this file (an array bound and comparisons against j) are unaffected */
#define NARGS DATA_COL_VARNAME_ARG+1 /* Manually adjust as args added */

/* Upper bound on the number of columns held in the width/start arrays */
#define MAX_COLS 100
/* Separator between items of the comma-separated list arguments */
#define INP_ARG_SEPARATOR ","
/* Characters trimmed from both ends of an input field (do_pass2) */
#define INP_FIELD_SEPARATOR " "
/* Characters written to the output */
#define OUTP_FIELD_SEP_CHAR '\t'
#define DEFGB_LEVEL_CONTINUATION_CHAR '>'
#define DEFGB_COMMENT_CHAR '#'
#define DEFGB_SCRIPT_INIT_CHAR '('
#define DEFGB_SCRIPT_CLOSE_CHAR ')'
/* Directory separator searched for in the input file name */
#define DIRSEP '/'
/* Limits on the header section: number of lines and line length */
#define MAXHEAD_LINES 20
#define MAXLENLINE 1257

/* Header section of the input file, one row per line; filled by       */
/* get_headerlines()                                                    */
/* Extra +1 is to allow for "line too long" test */
char header_lines[MAXHEAD_LINES][MAXLENLINE+1+1];
/* Positional arguments collected by main(), indexed by the argument   */
/* index macros above                                                   */
char *inp_arg[NARGS];
/* Input stream, and a flag recording whether it is currently open so  */
/* that ioclose() knows whether to call fclose()                        */
Logical in_stream_open = FALSE;
FILE *in_stream;
/* Set to TRUE by doswitch() when the pass2 switch is seen */
Logical pass2 = FALSE;

/************************************************************************/

/* Close the global input stream if it is marked open; clear the flag */
void ioclose()
{
  if (in_stream_open) {
    fclose(in_stream);
    in_stream_open=FALSE;
  }
  return;
}

/* Fatal error exit. Prints the two message strings s and t, then a    */
/* line identifying this module and version, using printf (so the text */
/* goes to standard output). Closes the input stream and calls exit(1).*/
/* Does not return.                                                     */
void err (s,t)
char *s,*t;
{
  printf ("%s %s\n",s,t);
  printf ("This message from %s %s\n",MODULE_SOURCE, MODULE_VERSION);
  ioclose();
  exit(1);
}

/* Handle one command line switch. switch_string is the switch text    */
/* after its leading '-' (main() passes the argument pointer plus 1).  */
/* Only "pass2" is accepted; anything else is a fatal error.            */
void doswitch(switch_string)
char *switch_string;
{
  if (strcmp(switch_string,"pass2") == 0)
    pass2 = TRUE;
  else
    err ("Illegal switch: -",switch_string);
  return;
}

char *get_line(buf,buflen,stream)
char *buf;
int buflen;
FILE *stream;
/* Read until we get a non-blank line. Strip a trailing newline */
/* buflen is assumed to be 1 char greater than max logically */
/* permissible line; therefore, if there is a significant character */
/* in the last position of the buffer, we have a "too long" error */
/* Returns buf, or NULL once fgets() returns NULL */
/* NOTE(review): fgets() stores at most buflen-1 characters, so j can  */
/* never reach buflen and the "too long" test below looks unreachable. */
/* The description above suggests a bound one smaller was intended;    */
/* confirm before changing.                                             */
{
  int j;

  while (fgets(buf,buflen,stream) != NULL)
    /* Skip blank lines (line consists of blanks and newlines) */
    if ( (j = strlen(buf)) != strspn(buf," \n") ) {
      /* j is at least 1 here: an empty string would have compared equal */
      if (buf[j-1] == '\n') buf[--j] = '\0';
      if (j >= buflen) err ("Input line too long","");
      return buf;
    }
  return NULL;
}

/* Read non-blank lines into header_lines[] until one contains         */
/* end_string. Returns the number of lines stored, counting the line   */
/* that matched. Fatal error (via err) if input ends first or if       */
/* MAXHEAD_LINES lines are read without a match.                        */
int get_headerlines(stream,end_string)
char *end_string;
FILE *stream;
{
  int i = -1;

  while (++i < MAXHEAD_LINES) {
    if (get_line(header_lines[i],MAXLENLINE+1,stream) == NULL)
      err ("Err/EOF before finding end-of-header string ",end_string);
    if (strstr(header_lines[i],end_string) != NULL)
      return i+1;
  }
  /* err() calls exit(), so control does not fall off the end here */
  err ("Too many header lines without finding end-of-header string ", end_string);
}

int match_var_in_inp_dir_name(datafile,first_col,first_col_start,
    n_cols,col_widths,n_lines)
char *datafile;
int first_col,first_col_start,n_cols,n_lines,col_widths[];
/* Implicit input header_lines array */
/* Compare lowest level directory name with values of header line */
/* variables. Assumes that header line variables are in "level */
/* order"; eg, cruise, if present, precedes station, if present, etc. */
/* and that at least one column has a value for each variable. */
/* This work based on Groman scheme of ordering levels by directory */
/* structure, with directory names being the values of level variables */
/* Logically, all columns should have the same value of this variable */
/* (because of "level" idea), but we do not check this */
/* Return -1 if no match */
/* On a match the value of j is returned; main() adds 1 to the result  */
/* and uses it to index header_lines. The code that sets j is not      */
/* visible in this copy (see the note in the body), so presumably j is */
/* the index of the matching header line -- confirm.                    */
/* datafile is modified in place while the search runs (cut at its     */
/* last DIRSEP) and the separator is put back before each return that  */
/* follows the cut.                                                     */
{
  int i,j,col,end_col;
  char *ptr,*ptr2,*dir,*dirptr,*field_start_ptr,*field_end_ptr,*end_directory;

  /* No directory separator in the name: nothing to compare against */
  if ( (end_directory = strrchr(datafile,DIRSEP)) == NULL )
    return -1;
  /* Cut the name at its last separator so that dir can be pointed at  */
  /* the start of the lowest level directory name                       */
  *end_directory = '\0';
  if ( (dir = strrchr(datafile,DIRSEP)) == NULL )
    dir = datafile;
  else
    dir++;
  end_col = first_col_start - 1;
  /* NOTE(review): source text appears to be missing on the next line, */
  /* between "i" and "field_end_ptr" (the loop headers, the setup of   */
  /* ptr, ptr2 and the field pointers, and three opening braces, going */
  /* by the three closing braces further down). Restore from a clean   */
  /* copy; the indentation used below is a guess at the nesting.        */
  for (i=0; i field_end_ptr) || (ptr2 == NULL) )
          ptr2 = field_end_ptr;
        /* Compare the field ptr..ptr2 with dir, ignoring case */
        dirptr = dir;
        while (ptr <= ptr2)
          if ( toupper(*(ptr++)) != toupper(*(dirptr++)) )
            break;
        /* NOTE(review): ptr is incremented as part of the comparison, */
        /* so a mismatch on the last character of the field also       */
        /* leaves ptr greater than ptr2 and is reported as a match.    */
        /* Nothing here checks that dir ends where the field ends.     */
        /* Confirm whether either is intended.                          */
        if (ptr > ptr2) {
          *end_directory = DIRSEP;
          return j;
        }
      }
    }
  }
  *end_directory = DIRSEP;
  return -1;
}

void count_cols(first_col_that_is_data,n_data_cols,first_data_col_char_number,
    col_array,list_col_names,col_widths_arg,n_headerlines)
int *first_col_that_is_data,*n_data_cols,*first_data_col_char_number;
int col_array[];
int n_headerlines;
char *list_col_names,*col_widths_arg;
/* Implicit input header_lines array */
/* Analyze header lines to find out number of columns in dataset and */
/* which of these represents the first "data column"; that is, which */
/* column contains the first repeated variable (count, in our case) */
/* Return which column this is, what character this column starts in, */
/* how many data columns there are, and an integer array of the widths */
/* of each column. */
/* Assumes that at least one header line has some information in the */
/* last data column */
/* col_widths_arg is tokenized in place with strtok(); each separator  */
/* that a later strtok() call returns past is written back.             */
{
  int maxlen,i,j;
  char *ptr;

  /* Find longest header line */
  maxlen = strlen(header_lines[0]);
  /* NOTE(review): source text appears to be missing on the next line, */
  /* between "i" and "maxlen" (the rest of the loop header and the     */
  /* start of the length test). Restore from a clean copy.              */
  for (i=0; i maxlen ) maxlen = j;

  /* Count number of distinct column variable names */
  /* The last of those is the first data column */
  /* The count is made by counting separators in list_col_names, which */
  /* gives the zero-based position of the last name in the list         */
  *first_col_that_is_data = 0;
  ptr = list_col_names;
  while ( (ptr = strchr(ptr,*INP_ARG_SEPARATOR)) != NULL ) {
    (*first_col_that_is_data)++;
    ptr++;
  }

  /* Create array of column widths, stopping in column containing */
  /* longest header line */
  j=0;
  ptr = strtok(col_widths_arg,INP_ARG_SEPARATOR);
  /* NOTE(review): source text appears to be missing on the next line, */
  /* between "i" and "= maxlen" (the loop bound, the opening brace,    */
  /* the code that stores each width in col_array, and whatever sets   */
  /* first_data_col_char_number). Restore from a clean copy.            */
  for (i=0; i= maxlen ) break;
    ptr = strtok(NULL,INP_ARG_SEPARATOR);
    /* strtok() replaced the separator in front of this token with a   */
    /* NUL; write the separator back (presumably so that the argument  */
    /* string can be used again later -- confirm)                       */
    if (ptr != NULL) *(ptr-1) = *INP_ARG_SEPARATOR;
  }
  if (i >= MAX_COLS) err ("Too many columns for program","");
  *n_data_cols = i - *first_col_that_is_data + 1;
  return;
}

Logical print_trimmed_col(col_start,col_width,chars_to_trim)
char *col_start,*chars_to_trim;
int col_width;
/* Trims chars_to_trim off beginning and end of col_start string */
/* Does a putchar for chars in between. Returns FALSE if nothing in */
/* in between */
/* The field is the col_width characters starting at col_start. The    */
/* output is actually written with one printf call that uses a         */
/* precision to limit the length, not with putchar. Returns TRUE when  */
/* something was printed.                                               */
{
  char *ptr,*end_ptr;

  end_ptr = col_start + col_width;
  /* Skip leading trim characters; strspn() is not limited to          */
  /* col_width, hence the test against end_ptr below                    */
  ptr = col_start + strspn(col_start,chars_to_trim);
  if ( (*ptr == '\0') || (ptr >= end_ptr) )
    return FALSE;
  /* Step end_ptr back to the last character not in chars_to_trim */
  while (end_ptr > ptr)
    if (strchr(chars_to_trim,*(--end_ptr)) == NULL)
      break;
  /* NOTE(review): the precision argument is a pointer difference,     */
  /* while printf expects an int for a precision given as an asterisk; */
  /* confirm this is safe on the target platform                        */
  printf("%.*s",end_ptr-ptr+1,ptr);
  return TRUE;
}

void do_pass2 (stream,n_cols,col_widths,n_headerlines,end_string)
FILE *stream;
int n_cols,n_headerlines;
int col_widths[];
char *end_string;
/* Generate def comment. Copy def variable list. */
/* Read columnar data, printing those selected. */
/* The selected columns come from inp_arg[SELECTED_COLS_ARG], which is */
/* tokenized in place with strtok(). The variable list comes from      */
/* inp_arg[DATA_COL_VARNAME_ARG]. Output goes to standard output.       */
{
  int col_starts[MAX_COLS];
  int this_selected_col = -1;
  int last_selected_col = 0;
  int i,selected_col,n_selected_cols;
  char *ptr;
  /* Extra +1 is to allow for "line too long" test */
  char buf[MAXLENLINE+1+1];

  printf ("%c {pass 2 of %s}\n",DEFGB_COMMENT_CHAR,MODULE_SOURCE);

  /* Put out var list, separating w/tabs instead of commas */
  ptr=inp_arg[DATA_COL_VARNAME_ARG]-1;
  while ( *(++ptr) != '\0')
    if (*ptr == *INP_ARG_SEPARATOR)
      putchar(OUTP_FIELD_SEP_CHAR);
    else
      putchar (*ptr);
  putchar ('\n');

  /* Make integer array of starts and widths of columns we want */
  col_starts[0] = 0;
  ptr = strtok(inp_arg[SELECTED_COLS_ARG],INP_ARG_SEPARATOR);
  while (ptr != NULL) {
    if (sscanf(ptr,"%d",&selected_col) != 1)
      err ("Bad character in selected column arg ",ptr);
    this_selected_col++;
    if (selected_col < 0)
      err ("Selected column < 0","");
    if (selected_col > 0) {
      /* Selected columns must lie within n_cols and be strictly       */
      /* increasing                                                     */
      if (selected_col > n_cols)
        err ("Selected column beyond last data column","");
      if (selected_col <= last_selected_col)
        err ("Selected columns not in increasing order","");
      col_starts[this_selected_col] = col_starts[this_selected_col-1];
      /* NOTE(review): source text appears to be missing on the next   */
      /* line, between "i" and "strlen(buf)" (the rest of this loop,   */
      /* the end of the column-selection code and the start of the     */
      /* loop that reads and prints data lines). Restore from a clean  */
      /* copy; the indentation used below is a guess at the nesting.    */
      for (i=last_selected_col; i strlen(buf))
        printf("nd%c",OUTP_FIELD_SEP_CHAR);
      /* A field with nothing left after trimming is written as "nd" */
      else if ( print_trimmed_col (buf+col_starts[i],col_widths[i],INP_FIELD_SEPARATOR) )
        putchar(OUTP_FIELD_SEP_CHAR);
      else
        printf("nd%c",OUTP_FIELD_SEP_CHAR);
    putchar('\n');
  }
  return;
}

/* Program entry point. Collects the switch and the NARGS positional   */
/* arguments, opens the input file, reads the header section and       */
/* analyzes the columns. With the pass2 switch it hands over to        */
/* do_pass2() and exits; otherwise it writes the top level description */
/* to standard output.                                                  */
/* NOTE(review): the end of this function is not visible in this copy. */
main(argc,argv)
int argc;
char *argv[];
{
  char *ptr,*ptr2;
  int cols[MAX_COLS];
  int i,j,col;
  int n_headerlines,first_logical_data_col,first_data_col;
  int n_logical_data_cols,n_interface_var,n_match_var;
  int n_switch_args = 0;

  /* i walks argv; j counts the positional arguments stored so far */
  i=1;
  j=0;
  while (i < argc) {
    /* An argument starting with '-' is a switch; it must be followed  */
    /* by another argument, which is stored as the next positional one */
    if (*argv[i] == '-') {
      doswitch (argv[i++]+1);
      if (i == argc)
        err ("Not enough input args to program","");
    }
    if (j >= NARGS)
      err ("Too many args to program","");
    inp_arg[j++]=argv[i++];
  }
  if (j != NARGS)
    err ("Not enough input args to program","");

  /* Do this first since file name is used logically in program */
  /* so if it's wrong, there are more problems than just not being */
  /* able to find the file */
  if ( (in_stream = fopen(inp_arg[INP_FILE_NAME],"r")) == NULL )
    err ("Bad open of input file ",inp_arg[INP_FILE_NAME]);
  in_stream_open = TRUE;

  n_headerlines = get_headerlines(in_stream,inp_arg[LAST_HEAD_STRING_ARG]);
  /* count_cols() fills in, in this order: the index of the first data */
  /* column, the number of data columns, the character position where  */
  /* the first data column starts, and the column widths                */
  count_cols(&first_logical_data_col,&n_logical_data_cols,&first_data_col,cols,
      inp_arg[DATA_COL_VARNAME_ARG],inp_arg[COL_WIDTHS_ARG],n_headerlines);

  if (pass2) {
    /* Total number of columns is first_ + n_ logical_data_cols */
    do_pass2(in_stream,first_logical_data_col+n_logical_data_cols,cols,
        n_headerlines,inp_arg[LAST_HEAD_STRING_ARG]);
    exit(0);
  }

  /* First pass: only the header lines already read are used below */
  fclose(in_stream);
  in_stream_open = FALSE;

  /* Compare variable values found in header lines with the */
  /* directory in which the data file is found. Idea is that those */
  /* variables need not be output-they are redundant. The variable */
  /* in the next header line is the "interface" variable in that */
  /* the script that invokes this program knows this variable value */
  /* and is invoking this program to provide data corresponding to */
  /* that variable value. */
  /* Right now, we assume that the interface variable MUST be in the */
  /* data, which means we will skip at least one header line variable */
  /* The rest of the header line variable list is the variable list */
  /* for this level, so output it */
  n_match_var = match_var_in_inp_dir_name(inp_arg[INP_FILE_NAME],
      first_logical_data_col,first_data_col,n_logical_data_cols,cols,
      n_headerlines);

  /* Find names of matching and interface variables in inp string */
  /* strtok destroys the beginning of the header line variable arg, */
  /* but we don't care since this is the only use of it */
  /* Match variable optional; next variable is interface variable */
  /* and must be present; see comments above */
  /* Print results as a comment line, ID'd by program name */
  ptr=strtok(inp_arg[HEADER_LINE_VARNAME_ARG],INP_ARG_SEPARATOR);
  if (n_match_var < 0)
    printf ("%c ",DEFGB_COMMENT_CHAR);
  else {
    /* Skip to match variable name; print it */
    /* NOTE(review): source text appears to be missing on the next     */
    /* line, between "i" and "= n_logical_data_cols" (the skip loop    */
    /* and the code that prints the name and value of the match        */
    /* variable). Restore from a clean copy.                            */
    for (i=0; i= n_logical_data_cols) printf ("[no value found]");
    /* Advance to interface variable name */
    ptr=strtok(NULL,INP_ARG_SEPARATOR);
  }

  /* Print interface variable name */
  printf(" %s ",ptr);

  /* Print value of interface variable */
  /* When no match was found n_match_var is -1, so this is row 0 of    */
  /* header_lines                                                       */
  ptr = header_lines[n_match_var+1]+first_data_col;
  /* NOTE(review): source text appears to be missing on the next line, */
  /* between "i" and "= n_logical_data_cols" (the loop that looks for  */
  /* and prints the value). Restore from a clean copy.                  */
  for (i=first_logical_data_col; i= n_logical_data_cols) printf ("[no value found]");
  printf(" {%s}\n",MODULE_SOURCE);

  /* Put out rest of variable list, separating w/tabs instead of */
  /* commas, finishing w/level continuation char as last list item */
  while ( (ptr = strtok(NULL,INP_ARG_SEPARATOR)) != NULL )
    printf("%s%c",ptr,OUTP_FIELD_SEP_CHAR);
  printf ("%c\n",DEFGB_LEVEL_CONTINUATION_CHAR);

  /* Variable list for next level is the list of data column */
  /* variables. Put them out, separating w/tabs instead of commas */
  ptr=inp_arg[DATA_COL_VARNAME_ARG]-1;
  while ( *(++ptr) != '\0')
    if (*ptr == *INP_ARG_SEPARATOR)
      putchar(OUTP_FIELD_SEP_CHAR);
    else
      putchar (*ptr);
  putchar ('\n');

  /* Print one line per data column. Line consists of header */
  /* line variables for that column, followed by invocation of */
  /* generate_garrahan_level_n_plus_1 for that column */
  /* NOTE(review): the visible text stops part way through the for     */
  /* statement below; the rest of main() is not available here.         */
  col = first_data_col;
  for (i=0; i