#define NULL 0 #define MAX(x,y) (((x)<(y))?(y):(x)) #define MIN(x,y) (((x)<(y))?(x):(y)) #include // ::memchr /** Andrew Fiori attempted implementation of cs240 substring algorithm for arbitrary binary data. [TODO] This code should be severely audited before production use -it would be nice if someone other than me could figure out how it works by reading it -and convince themselves that it works Consider Memory use optimizations -use shorts (or chars {this could be tricky}) rather than longs -do a hash of chars for char skip (reduce array size, could pe a parameter) -parameter for MAXLITTLE, usable on precomputed version -or enforce MAXLITTLE on all functions -recursize depth cutoff (recurse no more than parameter) Consider speed optimizations -improved partial match (bonus marks for the one who figures out what this means) Look for a byte sequence in a block of data algorithm should be better than O(lBig) -which is the expected for random binary input of the nieve search algorithm {on truly random data should be more like O(2*lBig/(max(256,lLittle)) this is based on the limited analysis actually done} -for well crafted data it might be as bad as O(lBig*lLittle) which is again the actual upper bound on nieve algorithm -the algorithm resorts to nieve were the cost of precomputation "might" end up exceeding the cost of nieve. this guarentees the algorithm can never be worse than O(nieve) -i did not fully compute the cost of precomputation but it should be no worse than O(lLittle^2) {L*log L is more likely closer to accurate} if you are going to be scanning for the same sequence alot you should use the precompute version and keep passing in the same precomputed values. -you need to delete them when you are done premice of algorithm (we look for little in big): -try to match the byte sequence backwards -if we fail to match on a character use heuristics to see where we should start trying to match again -heuristic one: Character jump, if we fail match and find in that location in big character x, we can shift to allign that location with last occurance of x in little. if x does not appear the last occurance is one character before start of little (expected jump of 128 on random data) -heuristic two Partial match, given that we have matched the last n bytes of the file where is the last time (excluding the end) that that series of 9 bytes appeares in little, we should shift to line those up (also check for if part of those n bytes is a prefix) (in truly random data the last n bytes should probobly never reappear allowing skipping whole string) -in the end we shift by the larger of the two (partial match is quarenteed to be at least 1) -both heuristics are precomputed and depend only on little hence the same precomputed values can be reused for subsequent searches for little in any other binary sequence. -Note: the precomputation algorithm for partial match is recursize as it calls memnmem -also the precomputation results in allocations of an array of 256 longs and of lLittle longs on the heap -the algorithm is likely best if little is fairly long and not paterned. -random data is actually far better. the pplskip, pplcharskip can be precomputed and reused for a little lLittle pair memnmem will create them if you pass in NULL pointers to it (that is *pplskip == NULL not pplskip == NULL) -you should delete these eventually */ /** look for first occurance of little (length lLittle) in big (length lBig) preskip,precharskip are essentially helper functions but can be used to create arrays, though it is recomeneded you just allow memnmem to create them by passing in NULL *pplcharskip, *pplskip */ char * memnmem(const char * big, long lBig, const char * little, long lLittle); char * memnmem(const char * big, long lBig, const char * little, long lLittle, long ** pplcharskip, long ** pplskip); void preskip(const char * little, long lLittle, long ** pplskip); void precharskip(const char * little, long lLittle,long ** pplcharskip); /** Backwards version of the above (finds last occurance, searches from the back) most usefull sinse they are used in doing the precomputation for the above */ char * memnrmem(const char * big, long lBig, const char * little, long lLittle); char * memnrmem(const char * big, long lBig, const char * little, long lLittle, long ** pplcharskip, long ** pplskip); void prerskip(const char * little, long lLittle, long ** pplskip); void prercharskip(const char * little, long lLittle,long ** pplcharskip); /** Simple (ie nieve) version of memnmem that resorts to memchr (good if little is small) memnmems would win for lLittle <= 4 */ char * memnmems(const char * big, long lBig, const char * little, long lLittle); char * memnrmems(const char * big, long lBig, const char * little, long lLittle); /** memrchr a reasonable implementation of memrchr */ void * memrchr(const void * pv, int c, long lLength); //The maximum size of data we will consider doing precomputation on // when precomputation and storage not explicitly requested // (ie the 4 parameter version, the 6 parameter version has no such limit) // this excess memory usage #define MAXLITTLE 1000 /** memnmem searches forward for first occurance of little in big O(B/L) + O(L^2) expected */ char * memnmem(const char * big, long lBig, const char * little, long lLittle) { if(big == NULL || lBig < lLittle){ return NULL; }else if(little == NULL || lLittle <= 0){ return (char *) big; }else if(lLittle < 5){ //do a lower bound on when to actually do the elaborate search return memnmems(big,lBig,little,lLittle); }else if(MAXLITTLE < lLittle){ //we dont want to do the calculations on something that big (too much memory) // instead just look for the first MAXLITTLE substring repeatedly // untill we find one where the whole matches char * pcTarget = (char *) (big-1); long * pl1 = NULL; long * pl2 = NULL; while(true){ pcTarget = (char *)::memnmem(pcTarget+1, MAX(0,(long)(big+lBig-pcTarget-lLittle+MAXLITTLE)), little, MAXLITTLE,&pl1,&pl2); if(pcTarget == NULL || ::memcmp(pcTarget,little,lLittle) == 0){ break; } } delete[] pl1; delete[] pl2; return pcTarget; } long * pl1 = NULL; long * pl2 = NULL; char * result = memnmem(big,lBig,little,lLittle,&pl1,&pl2); delete[] pl1; delete[] pl2; return result; } /** pplcharskip is a pointer to a length 256 array of longs O(L) */ void precharskip(const char * little, long lLittle,long ** pplcharskip) { if(lLittle <= 0){ memset(*pplcharskip,0,256*sizeof(long)); return; } long * plcharskip = *pplcharskip; for(int i = 0; i < 256; i++){ plcharskip[i] = 1; } for(int i = 0; i < lLittle; i++){ plcharskip[(unsigned char) little[i] ]= -i; } } /** preskip pplskip is pointer to an array length lLittle *pplskip contains how much to shift if fail at certain position (for forward scan) -based on matching suffixes O(L^2) on assumption that memnrmem is O(B) at worst */ void preskip(const char * little, long lLittle, long ** pplskip) { if(lLittle <= 1){ if(lLittle == 1){ **pplskip = 1; } return; } long * plskip = *pplskip; bool bfirstfail = false; char * pcPrev = (char *) little+lLittle-1; plskip[lLittle-1] = 1; for(long l = lLittle-1; l > 0; l--){ plskip[l-1] = 1; if(bfirstfail || ((pcPrev = memnrmem(little,pcPrev-little+lLittle-l-1,little+l,lLittle-l)) == NULL)){ //never appears as a substring plskip[l-1] = l+1; if(!bfirstfail){ //only on first fail do we need to check full tail for(long ll = lLittle-l-1; ll > 0; ll--){ if(memcmp(little,little+lLittle-ll,ll) == 0){ break; }else{ plskip[l-1]++; } } }else if(memcmp(little,little+l+1,lLittle-l-1) != 0){ //we checked the rest of the tail last time // and the whole tail does not match so same as last time // had we done whole loop initial plskip[l-1] starts 1 below last time // but iterate 1 extra to find it plskip[l-1]=plskip[l]; } bfirstfail=true; continue; } plskip[l-1] = (long)(little + l - pcPrev); } } /** memnmem plcharskip precomputed 256 array of longs pplskip is pointer to precomputed array length lLittle -1 if not these will be created and returned O(B/L) expected with precomputation O(B) worst case O(B/L) + O(L^2) expected without procomputation O(B + L^2) worst case */ char * memnmem(const char * big, long lBig, const char * little, long lLittle, long ** pplcharskip, long ** pplskip) { long * plcharskip; long * plskip; if(*pplcharskip != NULL){ plcharskip = *pplcharskip; }else{ plcharskip = new long[256]; *pplcharskip = plcharskip; precharskip(little,lLittle,pplcharskip); } if(*pplskip != NULL){ plskip = *pplskip; }else{ plskip =new long[lLittle]; *pplskip = plskip; preskip(little,lLittle,pplskip); } if(lLittle <= 0){ return (char *) big; }else if(lBig <= 0){ return NULL; } if(lLittle < 5){ //lLittle is small enough to knock out any gains we might expect //we precomputed anyways (this was trivial) because the interface says we should return memnmems(big,lBig,little,lLittle); } const char * pkcTarget = big; const char * pkcEnd = big+lBig-lLittle; while(pkcTarget <= pkcEnd){ long l; for(l = lLittle-1; pkcTarget[l] == little[l]; l--){ if(l==0){ return (char *)pkcTarget; } } pkcTarget+= MAX(plcharskip[(unsigned char)pkcTarget[l]] + l,plskip[l]); } return NULL; } /** memnrmem performs the reverse scan O(B/L) + O(L^2) expected without procomputation O(B + L^2) worst case */ char * memnrmem(const char * big, long lBig, const char * little, long lLittle) { if(big == NULL || lBig < lLittle){ return NULL; }else if(little == NULL || lLittle <= 0){ return (char *) big+lBig-1; }else if(lLittle < 5){ //do a lower bound on when to actually do the elaborate search //TODO figure out the correct lower bound (rather than 5) return memnrmems(big,lBig,little,lLittle); }else if(MAXLITTLE < lLittle){ char * pcTarget = (char *) (big+lBig-lLittle); long * pl1 = NULL; long * pl2 = NULL; while(true){ pcTarget = (char *)::memnrmem(big, MAX(0,(long)(pcTarget-big+MAXLITTLE)), little, MAXLITTLE,&pl1,&pl2); if(pcTarget == NULL || ::memcmp(pcTarget,little,lLittle) == 0){ break; } } delete[] pl1; delete[] pl2; return pcTarget; } long * pl1 = NULL; long * pl2 = NULL; char * result = memnrmem(big,lBig,little,lLittle,&pl1,&pl2); delete[] pl1; delete[] pl2; return result; } /** memnrmem performs the reverse scan using the precomputed values if possible (creating them if not) O(B/L) expected with precomputation O(B) worst case O(B/L) + O(L^2) expected without procomputation O(B + L^2) worst case */ char * memnrmem(const char * big, long lBig, const char * little, long lLittle, long ** pplcharskip, long ** pplskip) { long * plcharskip; long * plskip; if(*pplcharskip != NULL){ plcharskip = *pplcharskip; }else{ plcharskip = new long[256]; *pplcharskip = plcharskip; prercharskip(little,lLittle,pplcharskip); } if(*pplskip != NULL){ plskip = *pplskip; }else{ plskip =new long[lLittle]; *pplskip = plskip; prerskip(little,lLittle,pplskip); } if(lLittle <= 0){ return (char *) big+lBig-1; }else if(lBig <= 0){ return NULL; } char * pcTarget = (char *)big+lBig-lLittle; while(pcTarget >= big){ long l; for(l = 0; pcTarget[l] == little[l]; l++){ if(l==lLittle-1){ return pcTarget; } } pcTarget-=MAX(plcharskip[(unsigned char) pcTarget[l]] - l,plskip[l]); } return NULL; } /** prerskip precompute the shift if we fail after matching so far in (for reverse scan) *pplskip is an array containing the amount to shift so that the correctly matched portion may be lined up correctly. O(L^2) on assumption that memnmem is O(B) at worst */ void prerskip(const char * little, long lLittle, long ** pplskip) { if(lLittle <= 1){ if(lLittle == 1){ **pplskip = 1; } return; } long * plskip = *pplskip; bool bfirstfail = false; char * pcPrev = (char *) little+1; plskip[0] = 1; for(long l = 0; l < lLittle -1; l++){ plskip[l+1] = 1; if(bfirstfail || ((pcPrev = memnmem(pcPrev,little+lLittle-pcPrev,little,l+1)) == NULL)){ //never appears as a substring plskip[l+1] = lLittle-l; if(!bfirstfail){ //only on first fail do we need to check full tail for(long ll = l; ll > 0; ll--){ if(memcmp(little,little+lLittle-ll-1,ll+1) == 0){ break; }else{ plskip[l+1]++; } } }else if(memcmp(little,little+lLittle-l-1,l+1) != 0){ //we checked the rest of the tail last time // and the whole tail does not match so same as last time // had we done whole loop initial plskip[l+1] starts 1 below last time // but iterate 1 extra to find it plskip[l+1]=plskip[l]; } bfirstfail=true; continue; } plskip[l+1] = (long)(pcPrev-little); } } /** prercharskip precompute the character jump heuristics values for a reverse scan *pplcharskip will be an array indicating how far from front the first occurance of a character is O(lLittle) */ void prercharskip(const char * little, long lLittle,long ** pplcharskip) { if(lLittle <= 0){ memset(*pplcharskip,0,256*sizeof(long)); return; } long * plcharskip = *pplcharskip; for(int i = 0; i < 256; i++){ plcharskip[i] = lLittle; } for(int i = lLittle-1; i >= 0; i--){ plcharskip[(unsigned char) little[i] ]= i; } } /** memnmems simple memchr memcmp scan */ char * memnmems(const char * big, long lBig, const char * little, long lLittle) { if(lLittle == 0) return (char *)big; char * pcTarget = (char *) (big-1); while(true){ pcTarget = (char *)::memchr(pcTarget+1,(unsigned char) little[0],MAX(0,(long)(big+lBig-pcTarget-lLittle))); if(pcTarget == NULL || ::memcmp(pcTarget,little,lLittle) == 0){ break; } } return pcTarget; } /** memnrmems simple memrchr memcmp scan */ char * memnrmems(const char * big, long lBig, const char * little, long lLittle) { if(lLittle == 0) return (char *) (big+lBig); char * pcTarget = (char *) (big+lBig-lLittle+1); while(true){ pcTarget = (char *)::memrchr(big,(unsigned char) little[0],MAX(0,(long)(pcTarget-big))); if(pcTarget == NULL || ::memcmp(pcTarget,little,lLittle) == 0){ break; } } return pcTarget; } /** memrchr a reasonable implementation of memrchr */ void * memrchr(const void * pv, int c, long lLength) { unsigned char uc = (unsigned char) c; unsigned char * puc = (unsigned char *)pv+lLength; while(puc > pv){ if(*(--puc) == uc){ return puc; } } return NULL; } /** maxmatch finds the longest section of big (length lBig) that is a prefix of little (no longer than lLittle) *max will be a pointer to it *plMax will be its length these will be big/zero respectively if no match is found */ char * maxmatch(const char * big, long lBig, const char * little, long lLittle, char ** max, long * plMax) { *max = (char *) big; *plMax = 0; char * next; while((*plMax < lLittle) && (next = memnmem(*max+1,big+lBig-*max-1,little,*plMax+1)) != NULL){ *max = next; (*plMax)++; for(;(*plMax < lLittle) && (*max+*plMax < big+lBig)&&(*max)[*plMax] == little[*plMax];(*plMax)++); } return *max; }