Source Code & Software Patents: A Guide to Software & Internet Patent Litigation for Attorneys & Experts
by Andrew Schulman (http://www.SoftwareLitigationConsulting.com)
Code example for Chapter 1: What’s so special (or not so special) about source code?
Code example: mkndx & find in C
mkndx creates an index of lines of source code; find searches in indices created by mkndx
[TODO: show how to use, what output looks like, what index files look like]
[TODO: use strhash.c below to explain data structures, e.g. linked list, table, etc.; note that VBS and awk code merely assume the presence of associated arrays; strhash.c implements them, and strhash.h provides an interface]
[TODO: link to executable files for Windows, Mac OSX; show portion of disassembly; show portion of hexdump]
========
// ndx.c // TODO: // change strhash so val is (void *) to be determined by user // error if can't open file? // output current count to stderr // show ls equiv to "dir /s/b /a-d" #include <stdlib.h> #include <stdio.h> #include "strhash.h" void fail(char *s) { puts(s); exit(1); } void *alloc(int size) { void *p = calloc(size, 1); // 0-init if (! p) fail("insufficient memory"); return p; } HASHTAB files = (HASHTAB) 0; int get_files(char *s) { static int cnt = 0; // only init once! int this_cnt = 0; // init each time char *cmd, *buf; FILE *f; if (! files) // one-time init if (! (files = MakeHashTab())) fail("Can't create files list"); if (*s == '@') { f = fopen(s+1, "r"); if (! f) fail("can't open @file"); } else { cmd = alloc(1024); cmd[0] = '\0'; strcpy(cmd, "dir /s/b /a-d "); strcat(cmd, s); strcat(cmd, " > __tmp.tmp"); // TODO: make temp file //fprintf(stderr, "get_files: %s\n", cmd); system(cmd); free(cmd); f = fopen("__tmp.tmp", "r"); if (! f) fail("can't open __tmp.tmp"); } buf = alloc(2048); while (fgets(buf, 2047, f)) { if ((! *buf) || (*buf == '#')) // comments in @file continue; buf[strlen(buf)-1] = '\0'; // remove \n cnt++; this_cnt++; //fprintf(stderr, "get_files: <%s> [%u]\n", buf, cnt); SetHashTab(files, buf, cnt); // cnt = fnum } fclose(f); free(buf); return this_cnt; } int print_func(char *s, int count, int val, unsigned *p) { if (count == 1) printf("%u\t%s\t[%u,]\n", count, s, val); else { int i, top; printf("%u\t%s\t[", count, s); top = min(50, count); for (i=0; i<top; i++) { printf("%u,", p[i]); } printf("]\n"); } return 1; } HASHTAB strings = (HASHTAB) 0; static int gcnt = 0; int do_file(char *fname, int count, int fnum, unsigned *p) { static int fcnt = 0; int cnt = 0; FILE *f; char *buf; fcnt++; fprintf(stderr, "%u (%u%%)\t%s\t%u\n", fcnt, (100*fcnt) / (1+gcnt), fname, fnum); printf("%u\t%s\n", fnum, fname); f = fopen(fname, "r"); // if (! f) fail("can't open file"); // no, keep going if (! f) { fprintf(stderr, "\nCan't open <%s>\n\n", fname); return 0; } buf = alloc(10240); // TODO: need to create temp hashtab, so can do 1 entry per file // TODO: maybe change MakeHashTab to take param with tab_size, store while (fgets(buf, 10239, f)) { char *s = buf; buf[strlen(buf)-1] = '\0'; // remove \n while (*s == ' ' || *s == '\t') s++; // remove leading spaces if (*s) { cnt++; SetHashTab(strings, s, fnum); // will keep count } } fclose(f); free(buf); return 1; } main(int argc, char *argv[]) { unsigned i; if (argc < 2) fail("usage: ndx [file*.ext or @file]"); for (i=1; i<argc; i++) gcnt += get_files(argv[i]); printf("%u files\n", gcnt); // ForXInHashtab(files, print_func); puts("Files\n"); strings = MakeHashTab(); ForXInHashtab(files, do_file); fputs("Generating output...\n", stderr); puts("\nStrings\n"); ForXInHashtab(strings, print_func); FreeHashTab(files); FreeHashTab(strings); } ========== strhash.h: // strhash.h typedef void *HASHTAB ; HASHTAB MakeHashTab(void); void FreeHashTab(HASHTAB hashtab); void SetHashTab(HASHTAB hashtab, char *s, unsigned val); int GetHashTab(HASHTAB hashtab, char *s); typedef int (*FORIN_FUNC)(char *s, int count, unsigned val, unsigned *p); void ForXInHashtab(HASHTAB hashtab, FORIN_FUNC func); ========== // strhash.c // TODO: replace val with count and union of val and void* // use val when count == 1; in one test (Firefox source), 80% of // entries appeared only 1x #include <stdlib.h> #include <stdio.h> #include "strhash.h" typedef struct _node { struct _node *next; char *s; unsigned val; } NODE; void _fail(const char *s) { puts(s); exit(1); } void *my_alloc(int x, int y) { void *p = calloc(x,y); // zero init if (! p) _fail("Error: insufficient memory"); return p; } void insert(NODE **list, char *s, unsigned val) { NODE *node; NODE *l2 = *list; while (l2) { if (strcmp(l2->s, s) == 0) // could do case-insensitive { l2->val = val; // took 1 hr. to find bug where omitted this return; // already in list } else l2 = l2->next; } node = (NODE *) my_alloc(1, sizeof(NODE)); node->next = *list; *list = node; node->s = my_alloc(1, strlen(s)+1); strcpy(node->s, s); node->val = val; } int find(NODE *list, char *s) { while (list) { if (*s == *(list->s)) { static int num_strcmp = 0; num_strcmp++; if ((num_strcmp % 1000000) == 0) fprintf(stderr, "\n\n!! %u calls to strcmp\n\n\n", num_strcmp); if (strcmp(list->s, s) == 0) return list->val; else list = list->next; } else list = list->next; } return 0; // valid val begins with 1 } // prime numbers: 9973 11113 13997 16943 18917 // TODO: do TAB_SIZE and hash make sense together? #define TAB_SIZE 11113 // see http://www.cs.yorku.ca/~oz/hash.html unsigned long HASH(unsigned char *s) { unsigned long hash = 5381; int c; while (c = *s++) { hash = ((hash << 5) + hash) + c; } return (hash % TAB_SIZE); } NODE **mk_hashtab(void) { return (NODE **) my_alloc(TAB_SIZE, sizeof(NODE *)); } HASHTAB MakeHashTab(void) { return (HASHTAB) mk_hashtab(); } void FreeHashTab(HASHTAB hashtab) { free(hashtab); } void set(NODE **hashtab, char *s, unsigned val) { insert(&hashtab[HASH(s)], s, val); } void SetHashTab(HASHTAB hashtab, char *s, unsigned val) { set((NODE**) hashtab, s, val); } int get(NODE **hashtab, char *s) { return find(hashtab[HASH(s)], s); } int GetHashTab(HASHTAB hashtab, char *s) { return get((NODE**)hashtab, s); } typedef int (*FORIN_FUNC)(char *s, unsigned val); void for_in(NODE **hashtab, FORIN_FUNC func) { NODE *list; int i; for (i=0; i<TAB_SIZE; i++) { if (list = hashtab[i]) { while (list) { if (! (*func)(list->s, list->count, list->val, list->p)) return; // done if func returns 0 list = list->next; } } } } void ForXInHashtab(HASHTAB hashtab, FORIN_FUNC func) { for_in((NODE**) hashtab, func); } #ifdef TESTING int print_func(char *s, unsigned val) { printf("%u\t%s\n", val, s); return 1; } main(int argc, char *argv[]) { NODE **hashtab; char buf[10240]; int i, cnt=0; hashtab = mk_hashtab(); while (gets(buf)) // only get up to buf size { char *s = buf; while (*s == ' ' || *s == '\t') s++; // remove leading spaces if (*s) set(hashtab, s, ++cnt); } for_in(hashtab, print_func); } #endif ======= find.c: // find.c #include <stdlib.h> #include <stdio.h> #include <string.h> #include <ctype.h> #include "strhash.h" void fail(char *s) { puts(s); exit(1); } int case_sensitive = 0; char *strlower(char *s) // not in place; make copy { char *s2 = malloc(strlen(s)+1); char *p; if (! s2) fail("insufficient memory"); s2[0] = '\0'; strcpy(s2, s); // unsafe for (p=s2; *p; p++) { if ((*p >= 'A') && (*p <= 'Z')) *p = *p += 0x20; // 'a' - 'A' } return s2; // caller responsibility to free } char *match(char *str1, char *str2) { if (case_sensitive) return strstr(str1, str2); else { static char *s2 = (char *) 0; char *s1 = strlower(str1); char *ret; if (! s2) // one time, not going to change s2 = strlower(str2); ret = strstr(s1, s2); free(s1); // free(s2); return ret; } } main(int argc, char *argv[]) { HASHTAB hashtab; FILE *f; char *pat, *ndxfile, *buf, *s; typedef char *STRING; STRING arr[3]; int i, cnt; int in_files = 0, in_strings = 0; if (argc < 3) fail("usage: find [pat] [ndxfile]"); pat = argv[1]; printf("pat: <%s>\n", pat); ndxfile = argv[2]; if (! (f = fopen(ndxfile, "r"))) fail("can't open ndxfile"); if (! (hashtab = MakeHashTab())) fail("can't allocate hashtab"); if (! (buf = calloc(10240, 1))) fail("insufficient memory"); while (fgets(buf, 10239, f)) { s = buf; if ((! *s) || (*s == '#') || (*s == '\n')) // comments or blank lines continue; s[strlen(s)-1] = '\0'; // remove \n while (*s == ' ' || *s == '\t') s++; // remove leading spaces if (! *s) continue; if (arr[0] = strtok(s, "\t")) cnt = 1; if (arr[1] = strtok(NULL, "\t")) cnt = 2; if (arr[2] = strtok(NULL, "\t")) cnt = 3; if (cnt == 1) { if (strcmp(arr[0], "Files") == 0) // doh, remember match ret 0 { in_files = 1; } else if (strcmp(arr[0], "Strings") == 0) { in_files = 0; in_strings = 1; fputs("Done with files\n", stderr); } } if (in_files && (cnt == 2)) // TODO: error msg if cnt != 2 { // TODO: need ability to do val -> str, // following is hack // could also use first line with "%u files" to alloc array char *s = malloc(strlen(arr[1])+1); if (! s) fail("insufficient memory"); s[0] = '\0'; strcpy(s, arr[1]); SetHashTab(hashtab, arr[0], (unsigned) s); // not atoi //SetHashTab(hashtab, arr[1], atoi(arr[0])); //TODO: need ability to do val->str } if (in_strings && (cnt == 3) && match(arr[1], pat)) //TODO: err msg if cnt!=3 { int fcount = atoi(arr[0]); char *str = arr[1]; char *fnums = arr[2]; int len = strlen(fnums); int fnum; if (*fnums == '[') fnums++; if (fnums[len-2] == ']') fnums[len-2] = '\0' ; if ((fcount == 1) && (fnums[len-3] == ',')) { fnums[len-3] = '\0'; printf("%s\n\t%s\n", str, GetHashTab(hashtab, fnums)); } else if (fcount > 1) { char *fnumstr; printf("%s\n", str); if (fcount >= 50) printf("COMMON STRING: in %u files, including:\n", fcount); fnumstr = strtok(fnums, ","); do { printf("\t%s\n", GetHashTab(hashtab, fnumstr)); } while (fnumstr = strtok(NULL, ",")); } } } fclose(f); free(buf); }