Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

93
LINES

< > TinyBrain | #1008015 - Simple Wikipedia Bot [WORKS, downloads 127 MB, 700 MB on disk, 1 GB in memory]

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Uses 2369K of libraries. Click here for Pure Java version (7286L/52K/172K).

!7

// Global article store, sorted by key.
// Key: page title after processKey() (i.e. upper-cased), wrapped in U.
// Value: page text wrapped in U (stored as compressed strings).
static new TreeMap<U, U> wiki; // title to text (stored as compressed strings)

// Normalizes a page title into the form used for keys of the wiki map:
// the title is simply converted to upper case.
static S processKey(S s) {
  ret toUpper(s);
}

// Program entry point.
// 1. Sets up heap and console, unpacks the Simple Wikipedia dump and reads it
//    line by line, collecting each <page>...</page> element into the wiki map.
// 2. Shows a list of all titles next to the console; double-clicking a title
//    displays that article via answer().
// 3. Displays one random article and then keeps the bot alive.
p-subst { time {
  restartWith1GBHeap();
  quietGC(); // don't litter console with GC messages
  veryBigConsole();
  setConsoleWidth(900);
  centerConsole();
  consoleWordWrap();
  File f = unpackSimpleWikipedia();
  BufferedReader reader = utf8bufferedReader(f);
  try {
    // Denominator for the progress display; presumably the approximate number
    // of pages in the dump (TODO confirm against the current dump).
    final double expectedPages = 228400.0;
    S line;
    int pages = 0;
    StringBuilder pageBuf = null; // non-null only while inside a <page> element
    while ((line = reader.readLine()) != null) {
      line = trim(line);
      if (eq(line, "<page>"))
        pageBuf = new StringBuilder;
      if (pageBuf != null)
        pageBuf.append(line).append("\n");
      if (eq(line, "</page>")) {
        //print("Page done. " + l(pageBuf) + " chars");
        L<S> tok = htmlTok(str(pageBuf));
        S title = htmldecode(join(contentsOfContainerTag(tok, "title")));
        S text = trim(htmldecode(join(contentsOfContainerTag(tok, "text"))));
        if (!shouldSkip(title) && !empty(text)) {
          U key = new U(processKey(title));
          U old = wiki.get(key);
          S red = wikipedia_getRedirect(text);
          // Ignore pages that redirect to their own title and exact
          // (case-insensitive) duplicates of the text already stored.
          if (!eqic(red, title) && !eqic(text, str(old))) {
            /*if (old != null) {
              print("Double entry: " + title);
              print("  " + quote(str(old)));
              print("  " + quote(text));
            }*/
            // Store when the key is new, or when the stored entry is only a
            // redirect (it gets replaced by the newer page).
            if (old == null || wikipedia_getRedirect(str(old)) != null)
              wiki.put(key, new U(text));
          }
        }
        if ((++pages % 1000) == 0) {
          fractionDone(pages/expectedPages);
          print("Pages: " + pages + " (" + title + ")");
          sleep(1);
        }
        pageBuf = null;
      }
    }
  } finally {
    // Always release the file handle, even if parsing fails midway.
    reader.close();
  }
  }
  fractionDone(1);
  
  swing {
    JList list = jlist(allToString(keys(wiki)));
    addToWindowSplitRight(consoleFrame(), list);
    onDoubleClick(list, func(S item) {
      answer(item)
    });
  }
  
  // print a random entry
  answer(str(random(keys(wiki))));
  botSleep();
}

// Bot entry point for a user query `s`: looks up the closest matching title,
// resolves redirects and prints the article to a freshly cleared console.
// Returns " " when an article was shown; otherwise control falls through to
// the end of the answer block.
answer {
  U wanted = new U(processKey(s));
  U closest = nicestClosestKey(wiki, wanted);
  U resolved = followRedirect(closest);
  if (resolved != null) {
    S heading = toUpper(str(resolved));
    S body = str(wiki.get(resolved));
    clearConsole();
    consoleStatus(heading);
    print(heading);
    print();
    print(dropContainerTags(body)); // drop <ref>
    scrollConsoleUpIn(0.5);
    ret " ";
  }
}

static U followRedirect(U key) {
  U next;
  int count = 0;
  while ((next = toU(processKey(wikipedia_getRedirect(str(wiki.get(key)))))) != null && ++count < 10)
    key = next;
  ret key;
}

// Tells whether a page should be left out of the wiki map based on its title:
// true for titles starting with "Category:" or "Template:" (via swic).
static boolean shouldSkip(S title) {
  if (swic(title, "Category:")) ret true;
  ret swic(title, "Template:");
}

Author comment

// stuff to evaluate in "assist" (speed test for full-text searches)
// !j twice { time { for (O u : values((Map) get(mmc(), "wiki"))) words2(str(u)); } }
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (cic(str(u), "hello")) ++n; }} ret n;
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (contains(str(u), "hello")) ++n; }} ret n;
// !j int n = 0; twice { time { n = 0; for (O u : values((Map) get(mmc(), "wiki"))) if (cicFast(str(u), "hello")) ++n; }} ret n;

download  show line numbers  debug dex   

Travelled to 4 computer(s): cfunsshuasjs, onxytkatvevr, tvejysmllsmz, wtqryiryparv

No comments. add comment

Snippet ID: #1008015
Snippet name: Simple Wikipedia Bot [WORKS, downloads 127 MB, 700 MB on disk, 1 GB in memory]
Eternal ID of this version: #1008015/99
Text MD5: 3a85e50740e9f98d27af6d64c96d418a
Transpilation MD5: 81847932c5694a244704d48498e046f3
Author: stefan
Category: javax / a.i. / networking
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2017-05-04 14:39:14
Source code size: 2615 bytes / 93 lines
Pitched / IR pitched: No / No
Views / Downloads: 253 / 1108
Version history: 98 change(s)
Referenced in: [show]