// Scraper: downloads five Google Finance listing pages over WinINet, walks
// the HTML with a small hand-written tag finder, and prints one address per
// company.
//
// NOTE(review): the text of this file is damaged and will not build as-is.
//   * Everything is collapsed onto two physical lines. Because the first
//     line begins with "//", a compiler treats that entire line (the URL
//     table, GetPage() and the first part of FindTag()) as one comment.
//   * The four #include directives have no header names. The code uses
//     DWORD/HINTERNET, InternetOpen, InternetOpenUrl, InternetReadFile,
//     InternetCloseHandle, strlen, strcmp, isalpha, isdigit and printf, so
//     presumably <windows.h>, <wininet.h>, <stdio.h>, <string.h> and
//     <ctype.h> are needed -- TODO confirm against the original.
//   * Text is missing between "for (int i = 0; i" in FindTagArray() and
//     'addr, "http://finance.google.com");'. The rest of FindTagArray(), the
//     head of the function that follows it (presumably GetCompanies(), which
//     main() passes as the callback -- verify), and the declarations of
//     `companies`, `comp`, `tag_start` and GetAddress() are not present.
//     Restore them from the original source rather than guessing.
//   * The default argument (bool endtag = false) and the DWORD(b-buff) cast
//     are C++ syntax, so this has to be compiled as C++, not C.
//   * main()'s second loop is written "for (i = 0; ...)" although `i` is
//     only declared inside the first for-statement; standard C++ scoping
//     does not allow that.
//
// What the visible code does:
//   GetPage(url, buff, buffsize)
//     Opens a session with agent string "TechMap", opens `url`, and calls
//     InternetReadFile repeatedly, advancing the write pointer and shrinking
//     the remaining size, until the call fails or reports 0 bytes. Closes
//     both handles and returns the number of bytes stored in `buff` (0 if
//     either open failed).
//   FindTag(tag, buff, end, endtag)
//     Advances to the next '<', skips spaces/tabs, and copies the following
//     run of letters, '/' and '_' into a 1024-byte local buffer. If the next
//     character is '!' it moves on to the next '<'. The names br, img,
//     input, /b and /tbody are skipped. On a match (the name equals `tag`,
//     or, when `endtag` is true, the name is '/' followed by `tag`) it
//     returns b - strlen(tag). Otherwise, if the name starts with '/', it
//     returns the address of that tag's '<'; if not, it recurses to find the
//     closing tag of the element just opened and keeps scanning from there.
//     NOTE(review): the outer while(1) has no `b < end` test and the copy
//     into the local buffer is bounded only by `end`, not by the buffer
//     size; both deserve a check once the file is restored.
//   main()
//     For each of the 5 URLs, fills bFileBuffer via GetPage() and calls
//     FindTagArray() with tags {"html","body","table","tr"}, skips
//     {0,0,2,2} and GetCompanies as the callback. It then loops over 100
//     entries of `companies`, calling GetAddress(addr, addr) on each and
//     printing the resulting addr, one per line.
// Stealing addresses from Google!!! #include #include #include #include char *urls[5] = { "http://finance.google.com/finance?catid=66529330", "http://finance.google.com/finance?start=20&num=40&catid=66529330", "http://finance.google.com/finance?start=40&num=60&catid=66529330", "http://finance.google.com/finance?start=60&num=80&catid=66529330", "http://finance.google.com/finance?start=80&num=100&catid=66529330"}; const int FILE_BUFFER_SIZE = 1024*1024; char bFileBuffer[FILE_BUFFER_SIZE]; DWORD GetPage(char* url, char* buff, DWORD buffsize) { DWORD dReadFileCount = 0; char *b=buff; DWORD read=buffsize; HINTERNET hSession = InternetOpen ("TechMap", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0); if (hSession) { HINTERNET hSource = InternetOpenUrl(hSession, url, NULL, 0, INTERNET_FLAG_RAW_DATA | INTERNET_FLAG_NO_CACHE_WRITE | INTERNET_FLAG_DONT_CACHE | INTERNET_FLAG_KEEP_CONNECTION, 0); if (hSource) { while ( InternetReadFile( hSource, b, read, &dReadFileCount ) ) { if ( dReadFileCount == 0 ) break; b += dReadFileCount; read -= dReadFileCount; } InternetCloseHandle (hSource); } InternetCloseHandle (hSession); } return DWORD(b-buff); } char* FindTag(char* tag, char* buff, char* end, bool endtag = false) { #define ENDTAG true char *b=buff; char tagbuff[1024]; size_t len = strlen(tag); // printf("\tLooking for %s @ %x\n", tag, b); while (1) { while (b < end && *b != '<') {b++;}; b++; // Skip '<' char *tag_start = b; while (b < end && (*b == ' ' || *b == '\t')) {b++;}; char *t = tagbuff; while (b < end && (isalpha(*b) || (*b) == '/' || (*b) == '_')) {*t++ = *b++;}; *t = '\0'; if(*b == '!') { continue; } // printf("Found thistag %s @ %x\n", thistag, b); if(strcmp("br", tagbuff) == 0) { b++; continue; } if(strcmp("img", tagbuff) == 0) { b++; continue; } if(strcmp("input", tagbuff) == 0) { b++; continue; } if(strcmp("/b", tagbuff) == 0) { b++; continue; } if(strcmp("/tbody", tagbuff) == 0) { b++; continue; } if ((!endtag && strcmp(tagbuff, tag) == 0) || (endtag && 
strcmp(tagbuff+1, tag) == 0 && tagbuff[0] == '/')) { return b-len; } else { if(tagbuff[0] == '/') { return tag_start-1; } b = FindTag(tagbuff, b, end, ENDTAG); } } } int FindTagArray(char** tags, size_t* skips, int numtags, char* buff, char* end, int (*callback)(char* b, char* end)) { char* b = buff; for (int i = 0; iaddr, "http://finance.google.com"); char *str = comp->addr + strlen(comp->addr); while (tag_start < b-1 && *tag_start != '"') *str++ = *tag_start++; *str = 0; while (tag_start < b-1 && *tag_start != '>') tag_start++; tag_start++; str = comp->name; while (tag_start < b-1) *str++ = *tag_start++; *str = 0; b = FindTag("/td", b, end); b = FindTag("td", b, end); b = FindTag("/td", b, end); b = FindTag("td", b, end); b = FindTag("/td", b, end); b = FindTag("td", b, end); b = FindTag("/td", b, end); b = FindTag("td", b, end); tag_start = b; b = FindTag("/td", b, end); while (tag_start < b-2 && *tag_start != '>') tag_start++; tag_start++; while (tag_start < b-2 && !isdigit(*tag_start)) tag_start++; str = comp->marketCap; while (tag_start < b-2) *str++ = *tag_start++; *str = 0; b = FindTag("tr", b, end, ENDTAG); b += 4; } return 0; } int main(int argc, char ** argv) { for (int i = 0; i < 5; i++) { DWORD size = GetPage(urls[i], bFileBuffer, FILE_BUFFER_SIZE); char *b = bFileBuffer; char *end = bFileBuffer + size; const int NUM_TABS = 4; char *tags[NUM_TABS] = {"html", "body", "table", "tr"}; size_t skips[NUM_TABS] = {0, 0, 2, 2}; FindTagArray(tags, skips, NUM_TABS, b, end, GetCompanies); } // printf("\n"); for (i = 0; i < 100; i++) { GetAddress(companies[i].addr, companies[i].addr); // printf("Company %i: %s, %s, %s\n", i, companies[i].name, companies[i].marketCap, companies[i].addr); printf("%s\n", companies[i].addr); // printf("\t\n", // companies[i].name, i, companies[i].marketCap, companies[i].addr); } // printf("\n"); return 0; }