/*joinu from C. Hammond, from puddle.mit.edu 31 October 1996 r groman "It needs to be linked with outer.o (some variation - whatever you are calling yours) and jgofs.a which is usually in ../lib." */ /* Doc comments of v 2.3 (updated for 2.4) moved to end of source */ #define JOIN_VERSION "join version 2.6e 9 Apr 2016" /* 9 Apr 16. v 2.6e WJS */ /* Address note 1, 30 Sep 97 revision. It's correct! so control */ /* the (&&#*! PATH_INFO now that we are using join from OOserver */ /* on local objects */ /* Improve jdbopen error text and include error status */ /* [Begin v 2.6e] */ /* 20 Jan 10. v 2.6d WJS */ /* Let's call this join instead of joinu. Sorry if the u was */ /* significant */ /* Bug fix: incorrect use of array length as index to final array */ /* element. Caused SEGV (= server 500 in web interface) */ /* [Begin v 2.6d] */ /* 6 Aug 05. v 2.6c WJS */ /* errn needs to be declared */ /* Use level_splits from library */ /* [Needs utils 1.9] */ /* [Begin v 2.6c] */ /* 11 Nov 04. v 2.6b WJS */ /* Mods for new build procedure. Involves changes to defn files */ /* version ID stuff, and inclusion of an err routine. At some */ /* point, could probably go back and use code from utils in- */ /* stead of what I'm sure is duplicate here */ /* [Begin v 2.6b] */ /* 24 Sep 03. v 2.6a WJS */ /* Bug fix: "cumulative variable/level array" wasn't cumulative */ /* Anything w/ > 2 levels should have failed. */ /* [Begin v 2.6a] */ /* 8 May 00. v 2.6 WJS */ /* Extend 2.5 relevelization as follows: Suppose a level of an */ /* input object has no match set variable on it. Suppose that */ /* in that object, a match set variable occurs N levels above */ /* the level w/no matcher. Then, in the joined object, be sure */ /* that variables from the level w/no matcher occur N levels */ /* after the level now containing that nearest match set var. 
*/ /* Eg; if "NO2" occurs 1 level below "eventno" in the input, be */ /* sure it occurs 1 level below "eventno" in the output */ /* [Begin v 2.6] */ /* 4 Feb 00. v 2.5 WJS */ /* Bug fix (probably): (The probably refers to the fix. There's */ /* definitely a bug) When creating levels for joined object, */ /* we have always pushed down a variable in the match set to the */ /* lower of its positions in the 2 input objects. I think all */ /* variables on the "pushed from" level in the "pushed from" ob- */ /* ject should be pushed down, not just the match set variable */ /* [Begin v 2.5] */ /* 16 Jan 00. v 2.4a WJS */ /* Return width attribute as long as we went to all the trouble */ /* to calculate it! */ /* Bug fix: 13 Jan bug fix STILL wrong - this time due to bad */ /* internal comments */ /* [Begin v 2.4a] */ /* 13 Jan 00. v 2.4 WJS */ /* Reorder variables in joined object to try to retain as much of */ /* the order of the input objects as we can */ /* Bug fix: 21 Dec bug fix incorrect. Try again */ /* [Begin v 2.4] */ /* 21 Dec 99. WJS */ /* Add diagnostic for missing object name(s) */ /* Bug fix: width calc did not correctly maximize if var was in */ /* both objects. Suspect bug has been in all versions */ /* 16 Dec 99. WJS */ /* Modify memory scheme so that we don't allocate unused space */ /* per chunk */ /* 18 Nov 99. WJS */ /* Add right outer join and full outer join */ /* [Begin v 2.3] */ /* 8 Nov 99. WJS */ /* Bug fix: off-by-one error in a malloc. This bug has */ /* existed since at least vers 2.1 */ /* 4 Nov 99. WJS */ /* Accept JOINTYPE from environment, too */ /* 30 Oct 99. WJS */ /* Bug fix: left outer join multi-level logic faulty */ /* 28 Oct 99. WJS */ /* Add diagnostic if match set empty */ /* Add left outer join capability, controlled by compilation */ /* switch JOINTYPE */ /* [Begin v 2.2] */ /* 27 Sep 99. WJS */ /* Bug fix: counting error. Again, amazed it even seemed to work! */ /* [Begin v 2.1a] */ /* 4 May 99. 
WJS */ /* Fix massive bug caused by assuming jdbread_ gives a "line" */ /* instead of a level */ /* Add version info */ /* Add iovaldouble_ entry */ /* Define required functions */ /* Change #includes to OPTIONS & INNEROPTIONS per other methods */ /* (this mod in official OO version but don't know how it got */ /* there) */ /* Incorporate CLH fix of 13 Nov 98 made to official OO version */ /* [Begin v 2.1] */ /* 17 Oct 97. WJS */ /* Dynamically allocate space for saved object */ /* Correct bad parametrization */ /* 2 Oct 97. WJS */ /* Dynamically shape values2 */ /* Add alpha join (assumes alpha vals are illegal numerically) */ /* Provide non-zero widths */ /* Allow tokens of size DATUMSIZE (instead of DATUMSIZE-1) */ /* JGOFS 1.5 mods: */ /* Call error_ instead of doing printf */ /* Change iovalreal_ alpha value test */ /* Add in_ to internal function names */ /* #include *default.h files & reparametrize accordingly */ /* 30 Sep 97. WJS */ /* Process any input width attributes, and add iowidths_ entry */ /* (per JGOFS 1.5) to return this information. */ /* Type the void io*_ entries */ /* Notes: */ /* 1) requires PATH_INFO control if used on local objects */ /* Otherwise, if the environment variable is set to */ /* .html or .flat, program breaks */ /* 2) does not propagate attributes other than width */ /***********************/ #include INNEROPTIONS #include "utils.h" #include "path_info_routines.h" /* Get compile-time type of join */ #define INNER 1 #define LEFTOUTER 2 #define RIGHTOUTER 3 #define OUTER 4 #define FULLOUTER OUTER /* User convenience... 
*/ #ifndef JOINTYPE #define JOINTYPE INNER #endif unsigned char doing_rightouter = FALSE; unsigned char doing_leftouter = FALSE; /* char statements try to force join type into image for ID purposes */ #if JOINTYPE == INNER char jointype_id[] = "compiled join type: inner"; #elif JOINTYPE == LEFTOUTER doing_leftouter = TRUE; char jointype_id[] = "compiled join type: left outer"; #elif JOINTYPE == RIGHTOUTER doing_rightouter = TRUE; char jointype_id[] = "compiled join type: right outer"; #elif JOINTYPE == OUTER doing_rightouter = TRUE; doing_leftouter = TRUE; char jointype_id[] = "compiled join type: full outer"; #else char jointype_id[] = "compiled join type: unknown"; #error "JOINTYPE must be INNER, OUTER, RIGHTOUTER, or LEFTOUTER" #endif /***********************/ void error_(); /* ... entry in outer */ /* utils routines */ Logical add_id_to_err(); void errn(); int *level_splits(); char *buildstring(); /* #define DEBUG */ #ifndef MEMORY_ALLOCATION_CHUNK #define MEMORY_ALLOCATION_CHUNK 41960 #endif #define POINTERS_CHUNK 100 /* Information about variables in joined structure. A variable */ /* can be in the 1st object, the 2nd object, or both. If the */ /* variable is in both objects, the .index value is negative. */ /* If the variable is in exactly 1 object, .index is positive. */ /* If .index is positive, its value tells which object and */ /* .pntr tells where within that object. If .index is negative, */ /* it indicates where the variable is in the 2nd object */ /* (value is -(location+1)) and .pntr is set to where it is in the */ /* 1st object. 
.level tells what level the variable occupies   */
/* in the joined structure                                           */
/* v[] has one entry per variable of the joined object; ioopen_      */
/* fills and sorts it, and the io*_ entries index it by variable     */
/* number                                                             */
struct varinf {
  int pntr,index,level;
} v[NVAR];

int nlevels[2];      /* Number of levels in each input object         */
                     /* (jdbopen_ return value + 1; set in ioopen_)   */
char names[2][NVAR][VARNAMESIZE+1];
                     /* Variable names of each input object, as       */
                     /* handed to jdbopen_.  ioopen_ sets the first   */
                     /* char of a 2nd-object name to '\0' when that   */
                     /* variable is in the match set                  */
int *fldwidths,*fldwidths_for_attr;
                     /* Per joined variable, computed in ioopen_.     */
                     /* iowidth_ returns fldwidths[]; ioattrout_      */
                     /* reports fldwidths_for_attr[] once and then    */
                     /* zeroes the entry                              */
int namesize=VARNAMESIZE+1;   /* Passed by address to jdbopen_        */
int valuesize=DATUMSIZE+1;    /* Bytes per datum slot in a "line";    */
                              /* passed by address to jdbreada_       */
char *values; /* values[NVAR][DATUMSIZE+1]; ptr for convenience       */
              /* Allocated in ioopen_; holds the current record of    */
              /* the 1st (out-of-memory) object                       */

/* Dynamic memory scheme:                                            */
/*   Logically, we want values2 [nlines2+1] [nvars2] [valuesize]     */
/*   We could allocate, say, NLINES worth, and, after filling it,    */
/*   allocate 2*NLINES, copy the original space into the new, free   */
/*   the original, etc.  This is a lot of bytes to move, however.    */
/*   Instead, we make an array of pointers to chunks of size NLINES  */
/*   The pointers start out NULL.  We allocate NLINES worth to the   */
/*   first pointer.  When we exceed NLINES, we allocate NLINES       */
/*   worth to the second pointer, and start using that.  Logically   */
/*   this breaks up the [nlines2+1] dimension into 2 dimensions.     */
/*   One is "which chunk_size/NLINES chunk?".  The other runs        */
/*   from 0 to NLINES-1 within each chunk.                           */
/*   If we run out of pointers, we dynamically allocate more space,  */
/*   copy over, free, etc, but we do this to the pointers array,     */
/*   which is lots easier to move.                                   */
/*   There is a question of how much memory to allocate at once.     */
/*   I chose 41960 bytes, but it can be specified as a               */
/*   compilation constant (MEMORY_ALLOCATION_CHUNK).  It             */
/*   should be tuned to the efficiency of the system's memory        */
/*   allocation scheme (actually, the system's malloc routine's      */
/*   allocation scheme).  The actual allocation quantity is mod-     */
/*   fied up if necessary to hold at least one line, then further    */
/*   modified down to exactly hold an integral number of lines       */
char **values2 = NULL;   /* Array of pointers to chunks of lines of   */
                         /* the in-memory (2nd) object                */
char *val2lineptr;       /* Start of the "current" in-memory line;    */
                         /* set via in_iotest / in_getrec_mem and     */
                         /* read by iovalstr_                         */
int nlines2,linesize2;   /* Line count of the in-memory object, and   */
                         /* bytes per line (nt[1]*valuesize)          */
int nlines2perchunk;     /* Lines held by each values2[] chunk        */

int handle[2]={ -1, -1}; /* jdb handles of the two input objects      */
int nt[2];               /* Variable counts of the two input objects, */
                         /* handed by address to jdbopen_             */
int match[NVAR];   /* For each variable in the 1st object, position of */
                   /* same variable within 2nd object's variable list  */
                   /* or -1 if not found.  Might be able to remove this */
                   /* array due to mod to v structure (introduced in   */
                   /* join version 2.3), but didn't try...             */

/* Stuff for building lists of match set variables that occur on     */
/* the same level in the output object.  mlist has one entry per     */
/* match set variable; entries are chained through .next (-1 ends a  */
/* chain).  mlist_start[] holds the index of the first entry of each */
/* chain and match_set_out_levels is the number of chains.  Both     */
/* arrays are allocated and freed inside ioopen_                     */
struct match_levels {
  int lev_in_obj0;   /* Level of this match set variable in inp obj 0 */
  int lev_in_obj1;   /* Level of this match set variable in inp obj 1 */
  int next;   /* Pointer to next match set variable in this output level */
} *mlist;
int *mlist_start;
int match_set_out_levels;

int maxlev,minlevelread;  /* maxlev: deepest level of the joined      */
                          /* object (ioopen_'s return value).         */
                          /* minlevelread: read-state level shared    */
                          /* between ioopen_ and ioreadrec_           */
char *comments;           /* "join <obj1> <obj2>"; built in ioopen_   */
                          /* but not otherwise used                   */

unsigned char match_found;  /* Let iovalstr_ know whether or not most */
                            /* recent ioreadrec_ had a match          */
unsigned char *matchflag;   /* Flag per flat "line" of in-memory object */
                            /* indicating whether or not line has been */
                            /* matched to a line of the out-of-mem obj */
int *level_this_line = NULL;  /* Which level closest to 0 changed     */
                              /* for each flat "line"                 */
unsigned char doing_unmatched_records_2nd_obj = FALSE;
                            /* Set TRUE by ioreadrec_ once the 1st    */
                            /* object is exhausted during a right     */
                            /* outer join                             */

/************************************************************************/

char *join_return_vers()
/* Dummy routine.  Exists only to force .h file version string into   */
/* this module.  Note string must not be global or we'll have con-    */
/* flicts if another routine similarly includes the version string    */
/* Returns a pointer to a static string made of JOIN_VERSION, a "/",  */
/* and FULL_UTILSH_VERSION                                             */
{
  static char version[] = JOIN_VERSION"/"FULL_UTILSH_VERSION;
  return version;
}

void err(s,t)
char *s,*t;
/* Report an error through outer's error_ entry.  The two message    */
/* strings are first passed through add_id_to_err together with      */
/* JOIN_VERSION; the strings it hands back are what error_ receives  */
{
  char *ss,*tt;
  add_id_to_err(&ss,&tt,s,t,JOIN_VERSION);
  error_(ss,tt);
  return;          /* Not that it should ever get here...
*/ } void in_get_runtime_jointype() { char *ptr; if ( (ptr = getenv("JOINTYPE")) != NULL ) { /* Default to inner join, then adjust as needed. */ /* If env var is null, we're using compile-time vals & doing* */ /* has already been set */ doing_leftouter = FALSE; doing_rightouter = FALSE; if ((strcmp(ptr,"leftouter") == 0) || (strcmp(ptr,"LEFTOUTER") == 0)) doing_leftouter = TRUE; else if ((strcmp(ptr,"rightouter") == 0) || (strcmp(ptr,"RIGHTOUTER") == 0)) doing_rightouter = TRUE; else if ((strcmp(ptr,"outer") == 0) || (strcmp(ptr,"OUTER") == 0) || (strcmp(ptr,"fullouter") == 0) || (strcmp(ptr,"FULLOUTER") == 0) ){ doing_leftouter = TRUE; doing_rightouter = TRUE; } else if ((strcmp(ptr,"inner") == 0) || (strcmp(ptr,"INNER") == 0)) ; else error_("Bad JOINTYPE environment variable",ptr); } return; } int in_get_width(object_selector,variable_number) int object_selector,variable_number; /* Return as function value the width attribute found for variable */ /* "variable_number" in object whose jdbxxx_ "handle" is selected by */ /* "object_selector". If no width attribute, return 0. If > 1 */ /* attribute (! - shouldn't be), return value of first one */ { int j = 0; char str[ATTRSIZE]; while ( jdbattributes_(handle+object_selector, &variable_number, str) ) if (strncmp(str,"width=",6) == 0) { j = atoi(str+6); break; } return j; } int in_getrec(object_selector,buf) int object_selector; char *buf; { return jdbreada_(handle+object_selector,buf,&valuesize); } int in_getrec_mem(lineptr) char **lineptr; /* sets *lineptr to "next" line from in-memory object. Returns a */ /* "jdbread" status */ { int i,retval; char *val2ptr,*endptr; int ichunk,ilineinchunk; div_t div_struct; static int obj2_line = -1; retval = (obj2_line < 0) ? 0 : level_this_line[obj2_line+1]; while (++obj2_line < nlines2) { /* Note that the level must be adjusted even if a particular */ /* obj2_line is not returned for use */ if (level_this_line[obj2_line] < retval) retval = level_this_line[obj2_line]; if ( ! 
matchflag[obj2_line]) { div_struct = div(obj2_line,nlines2perchunk); ichunk = div_struct.quot; ilineinchunk = div_struct.rem; /* Logically, this selects the start of line obj2_line */ *lineptr = values2[ichunk] + ilineinchunk*linesize2; break; } } return (obj2_line == nlines2) ? -1 : retval; } int in_get_chunk (object_selector,chunk,linesize,nlines,level_splits, count,level_per_line) int object_selector,linesize,nlines; int *level_splits,*count,*level_per_line; char *chunk; /* Reads, into chunk, up to nlines of size linesize from object */ /* associated with object_selector. */ /* Returns number of lines read (!= nlines equivalent to EOF) */ { int linecnt,j,lev; char *start_this_line; static char *start_previous_line; start_this_line = chunk - linesize; for (linecnt=0; linecnt infinity in steps of */ /* POINTERS_CHUNK */ int ichunk; /* Goes from 0 -> POINTERS_CHUNK. */ /* Actual chunk pointer is */ /* chunk_pointer_base + ichunk */ char *ptr; char **temp; char **object_chunk_ptrs; int allocate; object_chunk_ptrs = *data_array; if (MEMORY_ALLOCATION_CHUNK <= linesize) allocate = linesize; else { *nlinesperchunk = MEMORY_ALLOCATION_CHUNK /linesize; allocate = *nlinesperchunk * linesize; } linecnt = 0; chunk_pointer_base = 0; ichunk = 0; while (1) { if (ichunk == 0) { /* Out of pointers to chunks. Allocate one POINTERS_CHUNK */ /* more than we have */ object_chunk_ptrs = (char **) realloc( object_chunk_ptrs, (chunk_pointer_base + POINTERS_CHUNK) * sizeof(char *) ); if (object_chunk_ptrs == NULL) errn("Could not get memory for 'object_chunk_ptrs'. nbytes=", (chunk_pointer_base + POINTERS_CHUNK) * sizeof(char *) ); } /* Allocate a chunk of memory and fill it with a chunk's worth */ /* of lines from the object. */ /* Save pointer to chunk in object_chunk_ptrs */ ptr = (char *) malloc(allocate); if (ptr == NULL) errn("Could not get memory for 'object_chunk_ptrs[i]'. 
nbytes=",allocate); /* Need memory for each line in this chunk, too */ level_this_line = (int *) realloc ( level_this_line, (linecnt + *nlinesperchunk) * sizeof (int *) ); if (level_this_line == NULL) errn("Could not get memory for 'level_this_line'", (linecnt + *nlinesperchunk) * sizeof (int *) ); i = in_get_chunk(object_selector,ptr,linesize,*nlinesperchunk,splits, &linecnt,level_this_line); object_chunk_ptrs[chunk_pointer_base+ichunk] = ptr; if (i != *nlinesperchunk) break; if (++ichunk == POINTERS_CHUNK) { chunk_pointer_base += POINTERS_CHUNK; ichunk = 0; } } *data_array = object_chunk_ptrs; jdbclose_(handle+object_selector); /* Original code made an extra line of "nd"s for some reason. If it */ /* supposed to "not match anything", "nd"s aren't really good - I */ /* suggest "**NoTrEaLdAtUm**" or some such. In any case, though, */ /* the returned line count was never incremented, so that line */ /* wasn't used. Just in case, commented out code follows... */ /* If needed, it must be modified for dynamic memory scheme */ /* for (i=0; ilevel > vj->level) return TRUE; if (vi->level < vj->level) return FALSE; /* Vars in both objects precede vars in just 1 object */ if ((vi->index >= 0) && (vj->index < 0)) return TRUE; if ((vi->index < 0) && (vj->index >= 0)) return FALSE; /* If both vars in both objects, use order within object 1 */ if ((vi->index < 0) && (vj->index < 0)) return (vi->pntr > vj->pntr); /* If vars in same object use order within that object. Otherwise */ /* vars from object 1 go first */ return (vi->index == vj->index) ? (vi->pntr > vj->pntr) : (vj->index == 0); } int in_iotest(val2lineptr) char **val2lineptr; /* Searches whole in-memory object for a line that matches the */ /* "current" line from the out-of-memory object. Returns the number */ /* of that line, or nlines2 if no match. val2lineptr is set to */ /* point to the start of the line from the in-memory object if there */ /* is a match. 
It points to the last line of the in-memory object */ /* if there is no match, but should not be used in that case */ { int i; double v0,v1; char *val2ptr,*endptr; int ichunk,ilineinchunk; int testline; ichunk = -1; ilineinchunk = nlines2perchunk-1; for (testline = 0; testline < nlines2; testline++){ if (++ilineinchunk == nlines2perchunk) { ichunk++; ilineinchunk = 0; } /* Logically, this selects the start of line testline */ *val2lineptr = values2[ichunk] + ilineinchunk*linesize2; for (i = 0; i < nt[0]; i++) if (match[i] >= 0) { val2ptr = *val2lineptr + match[i]*valuesize; v0 = strtod(values + i*valuesize, &endptr); if (*endptr == '\0') v1 = strtod(val2ptr,&endptr); if (*endptr == '\0') { if (v0 != v1) break; } else { if (strcmp(values + i*valuesize, val2ptr) != 0) break; } } if (i == nt[0]) break; } return testline; } unsigned char lev1s_match(level,list_start) int level,list_start; { int i; i = list_start; while (i >= 0) { if (mlist[i].lev_in_obj1 == level) break; i = mlist[i].next; } return (i >= 0); } int ioopen_(params,nparams,ntotal) char *params[]; int *nparams,*ntotal; /* s[0..nparams-1]: parameter strings. Inner sets s[j][0]=0 for any strings which it processes; others will be processed by outer. Thus selection/projections would normally be ignored by inner. nparams: number of parameter strings ntotal (returned): total number of variable names */ { char *tmp; struct varinf tempv; int i,i1,j,j1,k; int ih; int obj,var; /* Temp cumulative variable/level array for "second" object */ int *obj2_level_splits; /* level_map[2][maxlevel]... maps levels from each input object to */ /* corresponding levels joined object */ int *level_map[2]; /* nearest_match[2][maxlevel] records which match set variable is */ /* closest "above" or on a particular level for each object. 
*/ /* where_nearest_match[2][maxlevel] records how many levels above */ /* nearest_match is */ int *nearest_match[2],*where_nearest_match[2]; int lev1,last_obj0_level,last_i1; int iptr[2]; unsigned char levs_in_obj1_match; unsigned char found_maxlev; /* PATH_INFO function and strings. */ int new_and_old_path_infos(); /* "PATH_INFO=" + getenv("PATH_INFO"). Must be static */ /* since it "survives" in process table after ioopen_ exits) */ static char *PATH_INFO_orig_putenv; /* "PATH_INFO=" + getenv("PATH_INFO") w/ protocol = "jgof" */ char *PATH_INFO_jgof_putenv; in_get_runtime_jointype(); if (new_and_old_path_infos(&PATH_INFO_jgof_putenv,&PATH_INFO_orig_putenv) != PATH_INFO_NEW_AND_OLD_OK) err ("problem processing PATH_INFO env var. Memory issues or bad fmt. env var=", getenv(PATH_INFO_ENV_VAR) ); if (PATH_INFO_jgof_putenv != NULL) if ( (i = putenv(PATH_INFO_jgof_putenv)) != 0 ) err ("putenv failure for PATH_INFO",strerror(i)); /* Size data buffer big enough to hold a "line" of data */ values = (char *) malloc (NVAR*valuesize); if (values == NULL) errn("Could not get memory for 'values' buffer. nbytes=",NVAR*valuesize); if ((params[0] == NULL) || (params[1] == NULL)) err("Missing one or both input object names",""); i = strlen(params[0]); j = strlen(params[1]); if ((i == 0) || (j == 0)) err ("No characters in one or both input object names",""); /* Size a couple of buffers that hold object spec(s) */ /* We haven't parametrized that in defaults.h or innerdefaults.h */ /* (could use PATHSIZE) but our idea is to go dynamic anyway (& */ /* dynamic is better!) */ /* comments never used, but leave it in for now ... length is */ /* length of "join ", a blank, a terminating null, and params */ comments = (char *) malloc (5 + i + 1 + j + 1); if (comments == NULL) errn("Could not get memory for 'comments' buffer. 
nbytes=",5 + i + 1 + j + 1); strcpy(comments,"join "); strcat(comments,params[0]); strcat(comments," "); strcat(comments,params[1]); /* tmp needs to hold each spec consecutively, so size to max */ tmp = (i > j) ? (char *)malloc(i+1) : (char *)malloc(j+1); if (tmp == NULL) errn("Could not get memory for 'tmp' buffer. nbytes=",j+1); for (ih = 0; ih <= 1; ih++) { strcpy(tmp,params[ih]); nt[ih] = -NVAR; nlevels[ih] = jdbopen_(&handle[ih],tmp,names[ih],&namesize,&nt[ih]); if (nlevels[ih] < 0) { char jdb_status_error_buf[80]; /* Big enough to "do" next line */ sprintf (jdb_status_error_buf,"\njdbopen_: %d",nlevels[ih]); err(buildstring( "jdbopen problem trying to open object ", tmp, jdb_status_error_buf), "" ); } nlevels[ih]++; /* jdbopen really returns "maxlevel" not "nlevels" */ params[ih][0] = 0; } /* Restore original PATH_INFO if there was one */ if (PATH_INFO_orig_putenv != NULL) if ( (i = putenv(PATH_INFO_orig_putenv)) != 0 ) err ("putenv failure restoring PATH_INFO: ",strerror(i)); free (tmp); /* Calculate variable list for joined object and levels those */ /* variables have in joined object. The list consists of the */ /* variables in exactly one object, along with the variables that */ /* appear in both objects. In general, the level of a variable in */ /* the combined object is the same as the level of that variable */ /* in its own object. However, it is possible that the variables */ /* that appear in both objects appear at different levels. In */ /* such a case, use the "further away from level 0" value between */ /* the different levels. As of version 2.5, also move all other */ /* variables on the same input level "down" along with the common */ /* variable. If, after this has been done for all common vari- */ /* bles, the joined object has levels with no variables on them, */ /* "compress out" such levels */ /* Number of match set variables less than min of nvars in either input */ i = (nt[0] < nt[1]) ? 
nt[0] : nt[1]; mlist = (struct match_levels *)malloc(i * sizeof(struct match_levels)); /* Max number of levs of output obj is max number of levs in either inp */ j = (nlevels[0] > nlevels[1]) ? nlevels[0] : nlevels[1]; mlist_start = (int *)malloc(j * sizeof(int)); if ((mlist == NULL) || (mlist_start == NULL)) errn("Could not get memory for 'mlist' &/or 'mlist_start'. nbytes=", j*sizeof(int)); last_obj0_level = -1; match_set_out_levels = 0; k = 0; for (i = 0; i < nt[0]; i++) { for (j = 0; j < nt[1]; j++) if (!strcmp(names[0][i],names[1][j])) break; if (j == nt[1]) match[i]= -1; else { match[i] = j; names[1][j][0] = '\0'; /* Create a list of match set variables. Link all match set */ /* variables that occur on the same level in object 0 */ mlist[k].lev_in_obj0 = jdblevel_(&handle[0],&i); mlist[k].lev_in_obj1 = jdblevel_(&handle[1],&j); mlist[k].next = -1; if (mlist[k].lev_in_obj0 == last_obj0_level) mlist[k-1].next = k; else { last_obj0_level = mlist[k].lev_in_obj0; mlist_start[match_set_out_levels++] = k; } k++; } } if (k == 0) err("No variables in common between 2 input objects",""); /* For each level of each object, make a note of the nearest */ /* match set variable "above" (or on) it */ /******** This work (the 2.6 upgrade) apparently not completed ******/ /******** nearest_match arrays unused elsewhere, and note ******/ /******** empty loop, which, along w/this outer loop, appeared ******/ /******** in the first 2.6 source [this comment in 2.6b] ******/ for (ih = 0; ih <= 1; ih++) { nearest_match[ih] = (int *)malloc(nlevels[ih] * sizeof(int)); where_nearest_match[ih] = (int *)malloc(nlevels[ih] * sizeof(int)); if ((nearest_match[ih] == NULL) || (where_nearest_match[ih] == NULL)) errn("Could not get memory for '*nearest_match'. nbytes=", nlevels[ih] * sizeof(int) ); for (j = 0; j < nlevels[ih]; j++) { } } /* Link match set members to each other if they occur on the same */ /* level in either object. 
For example, if match set member 1 */ /* occurs on the same level as member 2 in object 1, and member 3 */ /* in object 2, link 1, 2 and 3 */ /* We start from a set of lists whose members must stay together */ /* because they are at the same level in object 0. Join 2 lists */ /* if ANY of their object 1 levels match. Note we must have at */ /* least 1 list and any list must have at least 1 member */ for (i = 0; i < match_set_out_levels; i++) { i1 = mlist_start[i]; levs_in_obj1_match = FALSE; while ((i1 >= 0) && (! levs_in_obj1_match)) { lev1 = mlist[i1].lev_in_obj1; for (j = i+1; j < match_set_out_levels; j++) if (levs_in_obj1_match = lev1s_match(lev1,mlist_start[j])) break; last_i1 = i1; i1 = mlist[i1].next; } /* If there was a match anywhere, concatenate the 2 lists (and */ /* reduce the number of lists) */ if (levs_in_obj1_match) { /* Advance to list end if not already there */ while (i1 > 0) { last_i1 = i1; i1 = mlist[i1].next; } mlist[last_i1].next = mlist_start[j]; mlist_start[j] = mlist_start[--match_set_out_levels]; } } /* For each inp object, create a map of its original levels to */ /* levels in the output object. Init these map such that level N */ /* of inp obj corresponds to level N of output obj */ for (ih = 0; ih <= 1; ih++) { level_map[ih] = (int *)malloc(nlevels[ih] * sizeof(int)); if (level_map[ih] == NULL) errn("Could not get memory for 'level_map'. nbytes=", nlevels[ih] * sizeof(int) ); for (j = 0; j < nlevels[ih]; j++) level_map[ih][j] = j; } /* Warp the maps! We have lists of match set variables that must */ /* occur on the same output level. They "drag along" the rest of */ /* the variables on the same level from their input objects. The */ /* level everything gets dragged to is the max level (either inp */ /* object) of all the vars on each list. 
*/ for (i = 0; i < match_set_out_levels; i++) { /* Get max level for this match set output level */ i1 = mlist_start[i]; j1 = 0; while (i1 >= 0) { if (j1 < mlist[i1].lev_in_obj0) j1 = mlist[i1].lev_in_obj0; if (j1 < mlist[i1].lev_in_obj1) j1 = mlist[i1].lev_in_obj1; i1 = mlist[i1].next; } /* Adjust every input level (both objects) that went into this */ /* match set output level to at least this max. Check higher */ /* too - we might have gone up > 1 level, and relative order of */ /* the input levels must be preserved (kind of - we can "flat- */ /* ten" but not "invert") */ i1 = mlist_start[i]; while (i1 >= 0) { j = mlist[i1].lev_in_obj0; if (level_map[0][j] < j1) { level_map[0][j++] = j1; for ( ; j < nlevels[0]; j++) if (level_map[0][j] < j1) level_map[0][j] = j1; } j = mlist[i1].lev_in_obj1; if (level_map[1][j] < j1) { level_map[1][j++] = j1; for ( ; j < nlevels[1]; j++) if (level_map[1][j] < j1) level_map[1][j] = j1; } i1 = mlist[i1].next; } } free(mlist); free(mlist_start); /* It's possible at this point that the joined object has levels */ /* w/no variables in it (eg, suppose there are 2 common variables, */ /* and suppose in object 1 they occur on levels 0 & 1, and in ob- */ /* ject 2 they occur on levels 1 and 0. The joined object will */ /* have nothing on level 1 and everything on level 2). Compress */ /* such levels out */ maxlev = 0; iptr[0] = iptr[1] = 0; while ((iptr[0] < nlevels[0]) || (iptr[1] < nlevels[1])) { for (i = iptr[0]; i < nlevels[0]; i++) if (level_map[0][i] > maxlev) break; iptr[0] = i; found_maxlev = (i > 0) ? (level_map[0][i-1] == maxlev) : FALSE; for (j = iptr[1]; j < nlevels[1]; j++) if (level_map[1][j] > maxlev) break; iptr[1] = j; if ( ! found_maxlev) found_maxlev = (j > 0) ? (level_map[1][j-1] == maxlev) : FALSE; if (found_maxlev) maxlev++; else { j1 = (level_map[0][i] < level_map[1][j]) ? 
level_map[0][i] : level_map[1][j]; j1 -= maxlev; for ( ; i < nlevels[0]; i++) level_map[0][i] -= j1; for ( ; j < nlevels[1]; j++) level_map[1][j] -= j1; } } maxlev--; k = 0; for (i = 0; i < nt[0]; i++) { /* kth var in joined object comes from out-of-mem obj or from */ /* both objs. Make special, negative, index to indicate latter */ /* "Extra" -1 is to cover match[i] = 0 */ v[k].index = (match[i] == -1) ? 0 : -match[i]-1; v[k].level = level_map [0] [ jdblevel_(&handle[0],&i) ]; v[k++].pntr = i; } for (i = 0; i < nt[1]; i++) if (names[1][i][0] != '\0') { v[k].level = level_map [1] [ jdblevel_(&handle[1],&i) ]; v[k].index = 1; /* kth var in joined object comes from in-mem obj */ v[k++].pntr = i; } free (level_map[0]); free (level_map[1]); /* Sort variables. See in_switchvars for criteria */ for (i = 0; i < k-1; i++) for (j = i+1; j < k; j++) if (in_switchvars(&v[i],&v[j])) { tempv = v[i]; v[i] = v[j]; v[j] = tempv; } /* See if there are any width attributes in either input object */ /* for each variable. Use the max of the length of the variable */ /* name, the attribute in object 1 (if any) and the attribute in */ /* object 2 (if any). */ fldwidths = (int *) malloc (k * sizeof(int)); if (fldwidths == NULL) errn("Could not get memory for 'fldwidths'. nbytes=",k*sizeof(int)); fldwidths_for_attr = (int *) malloc (k * sizeof(int)); if (fldwidths_for_attr == NULL) errn("Could not get memory for 'fldwidths_for_attr. nbytes=",k*sizeof(int)); for (i = 0; i < k; i++) { obj = (v[i].index <= 0) ? 
0 : 1; var = v[i].pntr; j = strlen (names[obj][var]); j1 = in_get_width (obj,var); if (j1 > j) j = j1; /* If in both objects, check 2nd object */ if (v[i].index < 0) { obj = 1; var = -v[i].index-1; j1 = in_get_width (obj,var); if (j1 > j) j = j1; } fldwidths_for_attr[i] = fldwidths[i] = j; } minlevelread = maxlev+1; /* For the in-memory object, want cumulative variable/level array */ obj2_level_splits = level_splits (handle[1], nlevels[1]-1, nt[1]); if (obj2_level_splits == NULL) errn("Could not get memory for 'obj2_level_splits' buffer. nbytes=", (nlevels[1] + 1)*sizeof(int) ); linesize2 = nt[1]*valuesize; nlines2 = in_get_entire_object (1,&values2,linesize2,&nlines2perchunk,obj2_level_splits); free (obj2_level_splits); if (doing_rightouter) { matchflag = (unsigned char *)malloc(nlines2); if (matchflag == NULL) errn("Could not get memory for 'matchflag' buffer. nbytes=",nlines2); for (i = 0; i < nlines2; i++) matchflag[i] = FALSE; } *ntotal = k; return maxlev; } int ioreadrec_(level) int *level; /* Read record at appropriate level. Return 0 if end at that level. Return 1 if ok. */ { static unsigned char eofflag = FALSE; int in_memory_line; if (eofflag) return 0; if (*level > minlevelread) return 0; if (*level == minlevelread) { minlevelread++; return 1; } /* Use next label via goto (gasp!) exactly once - when we reach */ /* the end of the out-of-memory object. We don't want to report */ /* EOF. We want to send back the first unmatched in-memory object */ /* record. Hence, we come here after "switching modes" to process */ /* the in-memory object */ get_next_rec: match_found = FALSE; /* Get a record for the composite object. We never actually put */ /* such a thing together - we set up pointers to the "current" */ /* record in both objects and let iovalstr_ pick from either */ /* record depending on the variable requested. */ /* values is the normal pointer. 
It points to an array into          */
  /* which jdbread places information from the out-of-memory object  */
  /* val2lineptr is the one for the in-memory object.  It points to  */
  /* a memory location previously filled in during get_entire_object */
  /* Thus, values is a constant pointer to space repeatedly filled   */
  /* in, while val2lineptr is a variable pointer to various places   */
  /* previously read in.  Note that in right outer joins, values is  */
  /* not used while processing the "leftovers".  Things should work  */
  /* such that iovalstr_ will always be filling in "nd"s             */

  if (doing_unmatched_records_2nd_obj) {   /* Doing in-memory object. */
    /* We've processed the out-of-memory object, during which time   */
    /* we output any records from the in-memory object that matched. */
    /* We're on to processing unmatched records from the in-memory   */
    /* object.  There is no match testing - we're only here because  */
    /* we want to do a right outer join, processing unmatched,       */
    /* records, we know the matched records have been output, and    */
    /* in_getrec_mem only returns records that have NOT been output. */
    /* iovalstr_ needs the match_found = FALSE value anyway; see     */
    /* discussions various other places                              */
    while (minlevelread > *level) {
      minlevelread = in_getrec_mem(&val2lineptr);
      if (minlevelread < 0) {
        /* End of in-memory object.  We're all done with whole join */
        eofflag = TRUE;
        return 0;
      }
    }
  } else {                             /* Doing out-of-memory object */
    /* Send all records from 1st object (for this level) back to     */
    /* outer if doing left outer join.  If not doing left outer      */
    /* join, only send back records that match.  Note we want the    */
    /* value of match_found even if we are doing left outer join.    */
    /* iovalstr_ needs it to know when to supply "nd"s.              */
    while (TRUE) {
      /* Read from the 1st object until the status in_getrec hands   */
      /* back is no deeper than the requested level                  */
      while (minlevelread > *level) {
        minlevelread = in_getrec(0,values);
        if (minlevelread < 0)
          /* End of out-of-memory object.  Switch to process         */
          /* previously unmatched records from in-memory object      */
          /* or quit, as appropriate                                 */
          if (doing_rightouter) {
            doing_unmatched_records_2nd_obj = TRUE;
            minlevelread = maxlev+1;       /* Reset to force "read" */
            goto get_next_rec;
          } else {
            eofflag = TRUE;
            return 0;
          }
      }
      /* We have next proper-level line from out-of-memory object    */
      /* See if we have a match to some in-memory object line &      */
      /* record the result if needed.  Then decide whether or not    */
      /* line we just got is satisfactory to us.  If so, break       */
      /* from loop.  If not, make sure we'll do a read & go do one   */
      /* (in_iotest returns nlines2 when no in-memory line matches)  */
      in_memory_line = in_iotest(&val2lineptr);
      match_found = (in_memory_line != nlines2);
      if (doing_rightouter && match_found)
        matchflag[in_memory_line] = TRUE;
      if (doing_leftouter || match_found) break;
      minlevelread = maxlev + 1;
    }
  }

  /* The record just obtained changed at a level above the one asked */
  /* for: report end at the requested level.  Otherwise note that    */
  /* this level has been delivered and report success                */
  if (minlevelread < *level) return 0;
  minlevelread++;
  return 1;
}

void ioclose_()
/* Close files */
/* Only handle[0] is passed to jdbclose_ here; handle[1] is closed   */
/* inside in_get_entire_object once the 2nd object has been read     */
{
  jdbclose_(handle);
}

void iovalstr_(vn,tmp)
int *vn;
char *tmp;
/* Return string value (tmp) for variable indexed by vn.              */
/* val2lineptr is implicit input from ioreadrec_.  It points to the   */
/* line from the entirely-read-in object that iovalstr_ is getting    */
/* values from.  A pointer to the line of the "normal" object is      */
/* implicit in that its values array is filled by ioreadrec_ from its */
/* current line                                                       */
/* Return "nd"s for "other object's" variables in non-matching records */
/* Note that ioreadrec_ controls which non-matching records get re-   */
/* turned to outer                                                    */
{
  char *s,*s1;
  int var_pos_within_its_object;
  unsigned char var_only_in_1st_obj,var_only_in_2nd_obj,generate_nd;

  /* .index is 0 for 1st-object-only variables, 1 for 2nd-object-    */
  /* only variables, and negative for variables in both objects      */
  var_only_in_1st_obj = (v[*vn].index == 0);
  var_only_in_2nd_obj = (v[*vn].index == 1);
  var_pos_within_its_object = v[*vn].pntr;
  if (var_only_in_1st_obj)
    s = values;
  else if (var_only_in_2nd_obj)
    s = val2lineptr;
  else     /* Variable in both objects.
Decide which one's data we'll use */ if (doing_unmatched_records_2nd_obj) { s = val2lineptr; /* See description of v data structure. */ var_pos_within_its_object = -v[*vn].index-1; } else s = values; if (match_found) generate_nd = FALSE; else generate_nd = (doing_unmatched_records_2nd_obj) ? var_only_in_1st_obj : var_only_in_2nd_obj; if (generate_nd) strcpy (tmp,"nd"); else { s += var_pos_within_its_object*valuesize; strcpy ( tmp, s+strspn(s," ") ); } return; } void iovaldouble_(vn,f) int *vn; double *f; { char tmp[DATUMSIZE+1]; char *endptr; iovalstr_(vn,tmp); *f = strtod(tmp,&endptr); if (*endptr != '\0') *f = -9999.0; return; } void iovalreal_(vn,f) int *vn; float *f; /* Return real value (f) for variable indexed by vn. -9999 for strings */ { double df; iovaldouble_(vn,&df); *f = df; return; } int iovarlevel_(vn) int *vn; /* Return level corresponding to variable indexed by vn. */ { return v[*vn].level; } int ioattrout_(vn,str) int *vn; char *str; /* Output attributes for variable indexed by vn. */ { if (fldwidths_for_attr[*vn] == 0) return 0; sprintf(str,"width=%d",fldwidths_for_attr[*vn]); fldwidths_for_attr[*vn] = 0; return 1; } void ioname_(vn,s) int *vn; char *s; /* Return name (s) corresponding to variable number vn. */ { int obj; obj = (v[*vn].index <= 0) ? 0 : 1; strcpy ( s, names [obj] [v[*vn].pntr] ); return; } int iocommout_(s) char *s; { return 0; } int iowidth_(vn) int *vn; /* "Return length of variable field indexed by vn" [from outer, JGOFS 1.5] */ { return fldwidths[*vn]; } /************************************************************************/ /* Scheme/"interesting" details: */ /* 1) Read 2nd object into memory. */ /* 2) The variable list of the constructed object is the union */ /* of the variable lists of the 2 input objects. 
The */ /* "match set" of variables is the intersection of the 2 lists */ /* 3) Within each level of the joined object, variables are */ /* arranged as follows: */ /* 1st: variables common to both objects */ /* Next: variables belonging to the first object in the */ /* order they appeared in the first object. */ /* Last: variables belonging to the 2nd object in the */ /* order they appeared in the 2nd object */ /* Common variables are ordered according to their order */ /* in the first object. In all orderings, variables closest */ /* to level 0 precede others. */ /* 4) Records of the 2 objects match when the values for all */ /* variables in the match set have the same value. */ /* Test is numeric if both vals are numeric; string else. */ /* Matching has nothing to do with levels - records that */ /* match are output w/the appropriate levelizing */ /* 5) It is possible that many records from each object have */ /* the same set of values for the match set of variables. */ /* In such cases, every record from the 1st object is */ /* matched with the first record from the 2nd object. Other */ /* records from the 2nd object are considered unmatched and */ /* appear depending on the type of join (item 6, below) */ /* 6) Number of records in constructed object: */ /* a) Inner join: number of records in the 1st object that */ /* have a match with some record in 2nd object. */ /* b) Left outer join: number of records in the 1st object. */ /* If no match is found in the 2nd object, constructed */ /* object variables that come from the 2nd object get */ /* a value of "nd" */ /* c) Right outer join: number of records in the 2nd object. */ /* If no match is found in the 1st object, constructed */ /* object variables that come from the 1st object get */ /* a value of "nd". Unmatched records appear after */ /* all records from the 1st object have been processed */ /* d) (Full) outer join: number of matched records plus */ /* number of unmatched records in both objects. 
Un- */ /* matched records from the 1st object appear ac- */ /* cording to the rules of 6b. Unmatched records from */ /* the 2nd object appear according to the rules of 6c */ /* The functionality described in 1, 2, 4 (except for string test), */ /* 5, and 6a was in the joinu code as of 30 Sep 98. */