Source code ch.01 code example: mkndx & find in C

Source Code & Software Patents: A Guide to Software & Internet Patent Litigation for Attorneys & Experts
by Andrew Schulman (http://www.SoftwareLitigationConsulting.com)

Code example for Chapter 1: What’s so special (or not so special) about source code?

Code example: mkndx & find in C

mkndx creates an index of lines of source code; find searches in indices created by mkndx

[TODO: show how to use, what output looks like, what index files look like]

[TODO: use strhash.c below to explain data structures, e.g. linked list, table, etc.; note that VBS and awk code merely assume the presence of associated arrays; strhash.c implements them, and strhash.h provides an interface]

[TODO: link to executable files for Windows, Mac OSX; show portion of disassembly; show portion of hexdump]

========

// ndx.c

// TODO:
// change strhash so val is (void *) to be determined by user
// error if can't open file?
// output current count to stderr
// show ls equiv to "dir /s/b /a-d"

#include <stdlib.h>
#include <stdio.h>
#include "strhash.h"

void fail(char *s) { puts(s); exit(1); }

void *alloc(int size)
{
void *p = calloc(size, 1); // 0-init
if (! p) fail("insufficient memory");
return p;
}

HASHTAB files = (HASHTAB) 0;

int get_files(char *s)
{
static int cnt = 0; // only init once!
int this_cnt = 0; // init each time
char *cmd, *buf;
FILE *f;
 
if (! files) // one-time init
     if (! (files = MakeHashTab()))
        fail("Can't create files list");

if (*s == '@')
  {
  f = fopen(s+1, "r");
     if (! f) fail("can't open @file");
  }
   else
  {
      cmd = alloc(1024);
      cmd[0] = '\0';
      strcpy(cmd, "dir /s/b /a-d ");
      strcat(cmd, s);
      strcat(cmd, " > __tmp.tmp"); // TODO: make temp file

         //fprintf(stderr, "get_files: %s\n", cmd);
         system(cmd);
         free(cmd);

  f = fopen("__tmp.tmp", "r");
  if (! f) fail("can't open __tmp.tmp");
  }

 buf = alloc(2048);
 while (fgets(buf, 2047, f))
 {
  if ((! *buf) || (*buf == '#')) // comments in @file
  continue;
  buf[strlen(buf)-1] = '\0'; // remove \n
  cnt++;
  this_cnt++;
  //fprintf(stderr, "get_files: <%s> [%u]\n", buf, cnt);
  SetHashTab(files, buf, cnt); // cnt = fnum
 }
 fclose(f);
 free(buf);

 return this_cnt;
}

int print_func(char *s, int count, int val, unsigned *p)
{
  if (count == 1)
  printf("%u\t%s\t[%u,]\n", count, s, val);
  else
  {
  int i, top;
  printf("%u\t%s\t[", count, s);
  top = min(50, count);
  for (i=0; i<top; i++)
  {
    printf("%u,", p[i]);
  }
  printf("]\n");
 }
return 1;
}

HASHTAB strings = (HASHTAB) 0;

static int gcnt = 0;

int do_file(char *fname, int count, int fnum, unsigned *p)
{
 static int fcnt = 0;
  int cnt = 0;
  FILE *f;
  char *buf;

  fcnt++;
  fprintf(stderr, "%u (%u%%)\t%s\t%u\n", fcnt,
       (100*fcnt) / (1+gcnt), fname, fnum);
  printf("%u\t%s\n", fnum, fname);
  f = fopen(fname, "r");
  // if (! f) fail("can't open file"); // no, keep going
  if (! f) { fprintf(stderr, "\nCan't open <%s>\n\n", fname); return 0; }
  buf = alloc(10240);

  // TODO: need to create temp hashtab, so can do 1 entry per file
  // TODO: maybe change MakeHashTab to take param with tab_size, store

  while (fgets(buf, 10239, f))
  {
  char *s = buf;
  buf[strlen(buf)-1] = '\0'; // remove \n
  while (*s == ' ' || *s == '\t') s++; // remove leading spaces
  if (*s)
  {
        cnt++;
        SetHashTab(strings, s, fnum); // will keep count
  }
  }
  fclose(f);
  free(buf);
  return 1;
}

main(int argc, char *argv[])
{
  unsigned i;

  if (argc < 2)
  fail("usage: ndx [file*.ext or @file]");

  for (i=1; i<argc; i++)
  gcnt += get_files(argv[i]);

  printf("%u files\n", gcnt);

  // ForXInHashtab(files, print_func);

  puts("Files\n");

  strings = MakeHashTab();

  ForXInHashtab(files, do_file);

  fputs("Generating output...\n", stderr);
  puts("\nStrings\n");

  ForXInHashtab(strings, print_func);

  FreeHashTab(files);
  FreeHashTab(strings);
}

==========

strhash.h:

// strhash.h
typedef void *HASHTAB ;

HASHTAB MakeHashTab(void);
void FreeHashTab(HASHTAB hashtab);
void SetHashTab(HASHTAB hashtab, char *s, unsigned val);
int GetHashTab(HASHTAB hashtab, char *s);

typedef int (*FORIN_FUNC)(char *s, int count, unsigned val, unsigned *p);
void ForXInHashtab(HASHTAB hashtab, FORIN_FUNC func);

==========

// strhash.c
// TODO: replace val with count and union of val and void*
// use val when count == 1; in one test (Firefox source), 80% of
// entries appeared only 1x

#include <stdlib.h>
#include <stdio.h>
#include "strhash.h"

typedef struct _node {
  struct _node *next;
  char *s;
  unsigned val;
  } NODE;

void _fail(const char *s) { puts(s); exit(1); }

void *my_alloc(int x, int y)
{
  void *p = calloc(x,y); // zero init
  if (! p)
    _fail("Error: insufficient memory");
  return p;
}

void insert(NODE **list, char *s, unsigned val)
{
  NODE *node;
  NODE *l2 = *list;
  while (l2)
  {
    if (strcmp(l2->s, s) == 0) // could do case-insensitive
    {
      l2->val = val; // took 1 hr. to find bug where omitted this
      return; // already in list
    }
    else
     l2 = l2->next;
  }

  node = (NODE *) my_alloc(1, sizeof(NODE));
  node->next = *list;
  *list = node;
  node->s = my_alloc(1, strlen(s)+1);
  strcpy(node->s, s);
  node->val = val;
}

int find(NODE *list, char *s)
{
  while (list)
  {
   if (*s == *(list->s))
   {
  static int num_strcmp = 0;
  num_strcmp++;
  if ((num_strcmp % 1000000) == 0)
   fprintf(stderr, "\n\n!! %u calls to strcmp\n\n\n", num_strcmp);
  if (strcmp(list->s, s) == 0)
   return list->val;
  else
   list = list->next;
   }
   else
    list = list->next;
  }
  return 0; // valid val begins with 1
}

// prime numbers: 9973 11113 13997 16943 18917
// TODO: do TAB_SIZE and hash make sense together?
#define TAB_SIZE 11113

// see http://www.cs.yorku.ca/~oz/hash.html
unsigned long HASH(unsigned char *s)
{
  unsigned long hash = 5381;
  int c;
  while (c = *s++) {
     hash = ((hash << 5) + hash) + c;
  }  
  return (hash % TAB_SIZE);
}

NODE **mk_hashtab(void) {
  return (NODE **) my_alloc(TAB_SIZE, sizeof(NODE *));
}

HASHTAB MakeHashTab(void) {
  return (HASHTAB) mk_hashtab();
}

void FreeHashTab(HASHTAB hashtab) {
  free(hashtab);
}

void set(NODE **hashtab, char *s, unsigned val) {
  insert(&hashtab[HASH(s)], s, val);
}

void SetHashTab(HASHTAB hashtab, char *s, unsigned val) {
  set((NODE**) hashtab, s, val);
}

int get(NODE **hashtab, char *s) {
  return find(hashtab[HASH(s)], s);
}

int GetHashTab(HASHTAB hashtab, char *s) {
  return get((NODE**)hashtab, s);
}

typedef int (*FORIN_FUNC)(char *s, unsigned val);

void for_in(NODE **hashtab, FORIN_FUNC func)
{
  NODE *list;
  int i;
  for (i=0; i<TAB_SIZE; i++)
  {
 if (list = hashtab[i])
 {
   while (list)
   {
  if (! (*func)(list->s, list->count, list->val, list->p))
   return; // done if func returns 0
  list = list->next;
   }
 }
  }
}

void ForXInHashtab(HASHTAB hashtab, FORIN_FUNC func)
{
  for_in((NODE**) hashtab, func);
}

#ifdef TESTING
int print_func(char *s, unsigned val)
{
  printf("%u\t%s\n", val, s);
  return 1;
}

main(int argc, char *argv[])
{
  NODE **hashtab;
  char buf[10240];
  int i, cnt=0;
 
  hashtab = mk_hashtab();

  while (gets(buf)) // only get up to buf size
  {
    char *s = buf;
    while (*s == ' ' || *s == '\t') s++; // remove leading spaces
    if (*s)
     set(hashtab, s, ++cnt);
  }

  for_in(hashtab, print_func);
}
#endif

=======

find.c:

// find.c

#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>
#include "strhash.h"

void fail(char *s) { puts(s); exit(1); }

int case_sensitive = 0;

char *strlower(char *s) // not in place; make copy
{
    char *s2 = malloc(strlen(s)+1);
    char *p;
    if (! s2) fail("insufficient memory");
    s2[0] = '\0';
    strcpy(s2, s); // unsafe
    for (p=s2; *p; p++) {
      if ((*p >= 'A') && (*p <= 'Z'))
         *p = *p += 0x20; // 'a' - 'A'
        }
    return s2; // caller responsibility to free
}

char *match(char *str1, char *str2)
{
  if (case_sensitive)
   return strstr(str1, str2);
  else
  {
   static char *s2 = (char *) 0;
   char *s1 = strlower(str1);
   char *ret;
   if (! s2) // one time, not going to change
    s2 = strlower(str2); 
  ret = strstr(s1, s2);
   free(s1);
   // free(s2);
   return ret;
  }
}

main(int argc, char *argv[])
{
  HASHTAB hashtab;
  FILE *f;
  char *pat, *ndxfile, *buf, *s;
  typedef char *STRING;
  STRING arr[3];
  int i, cnt;
  int in_files = 0, in_strings = 0;

  if (argc < 3)
   fail("usage: find [pat] [ndxfile]");

  pat = argv[1];
  printf("pat: <%s>\n", pat);
  ndxfile = argv[2];

  if (! (f = fopen(ndxfile, "r"))) fail("can't open ndxfile");
  if (! (hashtab = MakeHashTab())) fail("can't allocate hashtab");
  if (! (buf = calloc(10240, 1)))  fail("insufficient memory");

  while (fgets(buf, 10239, f))
  {
   s = buf;
   if ((! *s) || (*s == '#') || (*s == '\n')) // comments or blank lines
    continue;
   s[strlen(s)-1] = '\0'; // remove \n
   while (*s == ' ' || *s == '\t') s++; // remove leading spaces
   if (! *s)
    continue;

   if (arr[0] = strtok(s, "\t")) cnt = 1;
   if (arr[1] = strtok(NULL, "\t")) cnt = 2;
   if (arr[2] = strtok(NULL, "\t")) cnt = 3;

   if (cnt == 1)
   {
  if (strcmp(arr[0], "Files") == 0) // doh, remember match ret 0
  {
   in_files = 1;
  }
  else if (strcmp(arr[0], "Strings") == 0)
  {
   in_files = 0;
   in_strings = 1;
   fputs("Done with files\n", stderr);
  }
   }

   if (in_files && (cnt == 2)) // TODO: error msg if cnt != 2
   {
  // TODO: need ability to do val -> str,
  // following is hack
  // could also use first line with "%u files" to alloc array
  char *s = malloc(strlen(arr[1])+1);
  if (! s) fail("insufficient memory");
  s[0] = '\0';
  strcpy(s, arr[1]);
  SetHashTab(hashtab, arr[0], (unsigned) s); // not atoi
  //SetHashTab(hashtab, arr[1], atoi(arr[0])); //TODO: need ability to do val->str
   }

   if (in_strings && (cnt == 3) && match(arr[1], pat)) //TODO: err msg if cnt!=3
   {
  int fcount = atoi(arr[0]);
  char *str = arr[1];
  char *fnums = arr[2];
  int len = strlen(fnums);
  int fnum;

  if (*fnums == '[') fnums++;
  if (fnums[len-2] == ']') fnums[len-2] = '\0' ;
  if ((fcount == 1) && (fnums[len-3] == ','))
  {
   fnums[len-3] = '\0';
   printf("%s\n\t%s\n", str, GetHashTab(hashtab, fnums));

  }
  else if (fcount > 1)
  {
    char *fnumstr;
    printf("%s\n", str);
    if (fcount >= 50)
     printf("COMMON STRING: in %u files, including:\n", fcount);
    fnumstr = strtok(fnums, ",");
    do {
     printf("\t%s\n", GetHashTab(hashtab, fnumstr));
    } while (fnumstr = strtok(NULL, ","));
  }
 }
  }

  fclose(f);
  free(buf);
}