Source code ch.01 code example: mkndx & find in C

Source Code & Software Patents: A Guide to Software & Internet Patent Litigation for Attorneys & Experts
by Andrew Schulman (http://www.SoftwareLitigationConsulting.com)

Code example for Chapter 1: What’s so special (or not so special) about source code?

Code example: mkndx & find in C

mkndx creates an index of lines of source code; find searches in indices created by mkndx

[TODO: show how to use, what output looks like, what index files look like]

[TODO: use strhash.c below to explain data structures, e.g. linked list, table, etc.; note that VBS and awk code merely assume the presence of associated arrays; strhash.c implements them, and strhash.h provides an interface]

[TODO: link to executable files for Windows, Mac OSX; show portion of disassembly; show portion of hexdump]

========

// ndx.c

// TODO:
// change strhash so val is (void *) to be determined by user
// error if can't open file?
// output current count to stderr
// show ls equiv to "dir /s/b /a-d"

#include <stdlib.h>
#include <stdio.h>
#include "strhash.h"

void fail(char *s) { puts(s); exit(1); }

void *alloc(int size)
{
        void *p = calloc(size, 1); // 0-init
        if (! p) fail("insufficient memory");
        return p;
}

HASHTAB files = (HASHTAB) 0;

int get_files(char *s)
{
        static int cnt = 0; // only init once!
        int this_cnt = 0; // init each time
        char *cmd, *buf;
        FILE *f;
        
        if (! files) // one-time init
                if (! (files = MakeHashTab()))
                        fail("Can't create files list");

        if (*s == '@')
        {
            f = fopen(s+1, "r");
            if (! f) fail("can't open @file"); 
        }
        else
        {
            cmd = alloc(1024);
            cmd[0] = '\0';
            strcpy(cmd, "dir /s/b /a-d ");
            strcat(cmd, s);
            strcat(cmd, " > __tmp.tmp"); // TODO: make temp file

            //fprintf(stderr, "get_files: %s\n", cmd);
            system(cmd);
            free(cmd);

            f = fopen("__tmp.tmp", "r");
            if (! f) fail("can't open __tmp.tmp");
        }

       buf = alloc(2048);
       while (fgets(buf, 2047, f))
       {
        if ((! *buf) || (*buf == '#')) // comments in @file
            continue;
        buf[strlen(buf)-1] = '\0'; // remove \n
        cnt++;
        this_cnt++;
        //fprintf(stderr, "get_files: <%s> [%u]\n", buf, cnt);
        SetHashTab(files, buf, cnt); // cnt = fnum
       }
       fclose(f);
       free(buf);

       return this_cnt;
}

int print_func(char *s, int count, int val, unsigned *p)
{
       if (count == 1)
        printf("%u\t%s\t[%u,]\n", count, s, val);
       else
       {
        int i, top;
        printf("%u\t%s\t[", count, s);
        top = min(50, count);
        for (i=0; i<top; i++)
        {
           printf("%u,", p[i]);
        }
        printf("]\n");
       }
       return 1;
}

HASHTAB strings = (HASHTAB) 0;

static int gcnt = 0;

int do_file(char *fname, int count, int fnum, unsigned *p)
{
    static int fcnt = 0;
    int cnt = 0;
    FILE *f;
    char *buf;

    fcnt++;
    fprintf(stderr, "%u (%u%%)\t%s\t%u\n", fcnt, 
       (100*fcnt) / (1+gcnt), fname, fnum);
    printf("%u\t%s\n", fnum, fname);
    f = fopen(fname, "r");
    // if (! f) fail("can't open file"); // no, keep going
    if (! f) { fprintf(stderr, "\nCan't open <%s>\n\n", fname); return 0; }
    buf = alloc(10240);

    // TODO: need to create temp hashtab, so can do 1 entry per file
    // TODO: maybe change MakeHashTab to take param with tab_size, store

    while (fgets(buf, 10239, f))
    {
        char *s = buf;
        buf[strlen(buf)-1] = '\0'; // remove \n
        while (*s == ' ' || *s == '\t') s++; // remove leading spaces
        if (*s)
        {
            cnt++;
            SetHashTab(strings, s, fnum); // will keep count
        }
    }
    fclose(f);
    free(buf);
    return 1;
}

main(int argc, char *argv[])
{
    unsigned i;

    if (argc < 2)
        fail("usage: ndx [file*.ext or @file]");

    for (i=1; i<argc; i++)
        gcnt += get_files(argv[i]);

    printf("%u files\n", gcnt);

    // ForXInHashtab(files, print_func);

    puts("Files\n");

    strings = MakeHashTab();

    ForXInHashtab(files, do_file);

    fputs("Generating output...\n", stderr);
    puts("\nStrings\n");

    ForXInHashtab(strings, print_func);

    FreeHashTab(files);
    FreeHashTab(strings);
}

==========

strhash.h:

// strhash.h
typedef void *HASHTAB ;

HASHTAB MakeHashTab(void);
void FreeHashTab(HASHTAB hashtab);
void SetHashTab(HASHTAB hashtab, char *s, unsigned val);
int GetHashTab(HASHTAB hashtab, char *s);

typedef int (*FORIN_FUNC)(char *s, int count, unsigned val, unsigned *p);
void ForXInHashtab(HASHTAB hashtab, FORIN_FUNC func);

==========

// strhash.c
// TODO: replace val with count and union of val and void*
// use val when count == 1; in one test (Firefox source), 80% of
// entries appeared only 1x

#include <stdlib.h>
#include <stdio.h>
#include "strhash.h"

typedef struct _node {
    struct _node *next;
    char *s;
    unsigned val;
    } NODE;

void _fail(const char *s) { puts(s); exit(1); }

void *my_alloc(int x, int y)
{
    void *p = calloc(x,y); // zero init
    if (! p)
        _fail("Error: insufficient memory");
    return p;
}

void insert(NODE **list, char *s, unsigned val)
{
    NODE *node;
    NODE *l2 = *list;
    while (l2)
    {
        if (strcmp(l2->s, s) == 0) // could do case-insensitive
        {
                l2->val = val; // took 1 hr. to find bug where omitted this
                return; // already in list
        }
        else
            l2 = l2->next;
    }

    node = (NODE *) my_alloc(1, sizeof(NODE));
    node->next = *list;
    *list = node;
    node->s = my_alloc(1, strlen(s)+1);
    strcpy(node->s, s);
    node->val = val;
}

int find(NODE *list, char *s)
{
    while (list)
    {
       if (*s == *(list->s))
       {
        static int num_strcmp = 0;
        num_strcmp++;
        if ((num_strcmp % 1000000) == 0)
            fprintf(stderr, "\n\n!! %u calls to strcmp\n\n\n", num_strcmp);
        if (strcmp(list->s, s) == 0)
            return list->val;
        else
            list = list->next;
       }
       else
          list = list->next;
    }
    return 0; // valid val begins with 1
}

// prime numbers: 9973 11113 13997 16943 18917
// TODO: do TAB_SIZE and hash make sense together? 
#define TAB_SIZE 11113

// see http://www.cs.yorku.ca/~oz/hash.html
unsigned long HASH(unsigned char *s)
{
    unsigned long hash = 5381;
    int c;
    while (c = *s++) {
            hash = ((hash << 5) + hash) + c;
    }       
    return (hash % TAB_SIZE);
}

NODE **mk_hashtab(void) {
    return (NODE **) my_alloc(TAB_SIZE, sizeof(NODE *));
}

HASHTAB MakeHashTab(void) {
    return (HASHTAB) mk_hashtab();
}

void FreeHashTab(HASHTAB hashtab) {
    free(hashtab);
}

void set(NODE **hashtab, char *s, unsigned val) {
    insert(&hashtab[HASH(s)], s, val);
}

void SetHashTab(HASHTAB hashtab, char *s, unsigned val) {
    set((NODE**) hashtab, s, val);
}

int get(NODE **hashtab, char *s) {
    return find(hashtab[HASH(s)], s);
}

int GetHashTab(HASHTAB hashtab, char *s) {
    return get((NODE**)hashtab, s);
}

typedef int (*FORIN_FUNC)(char *s, unsigned val);

void for_in(NODE **hashtab, FORIN_FUNC func)
{
    NODE *list;
    int i;
   for (i=0; i<TAB_SIZE; i++)
   {
    if (list = hashtab[i])
    {
        while (list)
        {
        if (! (*func)(list->s, list->count, list->val, list->p))
            return; // done if func returns 0
        list = list->next;
        }
    }
   }
}

void ForXInHashtab(HASHTAB hashtab, FORIN_FUNC func)
{
    for_in((NODE**) hashtab, func);
}

#ifdef TESTING
int print_func(char *s, unsigned val)
{
    printf("%u\t%s\n", val, s);
    return 1;
}

main(int argc, char *argv[])
{
    NODE **hashtab;
    char buf[10240];
    int i, cnt=0;
    
    hashtab = mk_hashtab();

    while (gets(buf)) // only get up to buf size
    {
        char *s = buf;
        while (*s == ' ' || *s == '\t') s++; // remove leading spaces
        if (*s)
            set(hashtab, s, ++cnt);
    }

    for_in(hashtab, print_func);
}
#endif

=======

find.c:

// find.c

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "strhash.h"

void fail(char *s) { puts(s); exit(1); }

int case_sensitive = 0;

char *strlower(char *s) // not in place; make copy
{
        char *s2 = malloc(strlen(s)+1);
        char *p;
        if (! s2) fail("insufficient memory");
        s2[0] = '\0';
        strcpy(s2, s); // unsafe
        for (p=s2; *p; p++) {
                if ((*p >= 'A') && (*p <= 'Z'))
                        *p = *p += 0x20; // 'a' - 'A'
        }
        return s2; // caller responsibility to free
}

char *match(char *str1, char *str2)
{
    if (case_sensitive)
        return strstr(str1, str2);
    else
    {
        static char *s2 = (char *) 0;
        char *s1 = strlower(str1);
        char *ret;
        if (! s2) // one time, not going to change
            s2 = strlower(str2);
        ret = strstr(s1, s2);
        free(s1);
        // free(s2);
        return ret;
    }
}

main(int argc, char *argv[])
{
    HASHTAB hashtab;
    FILE *f;
    char *pat, *ndxfile, *buf, *s;
    typedef char *STRING;
    STRING arr[3];
    int i, cnt;
    int in_files = 0, in_strings = 0;

    if (argc < 3)
        fail("usage: find [pat] [ndxfile]");

    pat = argv[1];
    printf("pat: <%s>\n", pat);
    ndxfile = argv[2];

    if (! (f = fopen(ndxfile, "r"))) fail("can't open ndxfile");
    if (! (hashtab = MakeHashTab())) fail("can't allocate hashtab");
    if (! (buf = calloc(10240, 1)))  fail("insufficient memory");

    while (fgets(buf, 10239, f))
    {
        s = buf;
        if ((! *s) || (*s == '#') || (*s == '\n')) // comments or blank lines
            continue;
        s[strlen(s)-1] = '\0'; // remove \n
        while (*s == ' ' || *s == '\t') s++; // remove leading spaces
        if (! *s)
            continue;

        if (arr[0] = strtok(s, "\t")) cnt = 1;
        if (arr[1] = strtok(NULL, "\t")) cnt = 2;
        if (arr[2] = strtok(NULL, "\t")) cnt = 3;

       if (cnt == 1)
       {
        if (strcmp(arr[0], "Files") == 0) // doh, remember match ret 0
        {
            in_files = 1;
        }
        else if (strcmp(arr[0], "Strings") == 0)
        {
            in_files = 0;
            in_strings = 1;
            fputs("Done with files\n", stderr);
        }
       }

       if (in_files && (cnt == 2)) // TODO: error msg if cnt != 2
       {
        // TODO: need ability to do val -> str,
        // following is hack
        // could also use first line with "%u files" to alloc array
        char *s = malloc(strlen(arr[1])+1);
        if (! s) fail("insufficient memory");
        s[0] = '\0';
        strcpy(s, arr[1]);
        SetHashTab(hashtab, arr[0], (unsigned) s); // not atoi
        //SetHashTab(hashtab, arr[1], atoi(arr[0])); //TODO: need ability to do val->str
       }

       if (in_strings && (cnt == 3) && match(arr[1], pat)) //TODO: err msg if cnt!=3
       {
        int fcount = atoi(arr[0]);
        char *str = arr[1];
        char *fnums = arr[2];
        int len = strlen(fnums);
        int fnum;

        if (*fnums == '[') fnums++;
        if (fnums[len-2] == ']') fnums[len-2] = '\0' ;
        if ((fcount == 1) && (fnums[len-3] == ','))
        {
            fnums[len-3] = '\0';
            printf("%s\n\t%s\n", str, GetHashTab(hashtab, fnums));

        }
        else if (fcount > 1)
        {
            char *fnumstr;
            printf("%s\n", str);
            if (fcount >= 50)
                printf("COMMON STRING: in %u files, including:\n", fcount);
            fnumstr = strtok(fnums, ",");
            do {
                printf("\t%s\n", GetHashTab(hashtab, fnumstr));
            } while (fnumstr = strtok(NULL, ","));
        }
       }
    }

    fclose(f);
    free(buf);
}


Print Friendly, PDF & Email