Not logged in.  Login/Logout/Register | List snippets | | Create snippet | Upload image | Upload data

159
LINES

< > TinyBrain | #1002282 - An NL Parser (developing)

JavaX source code [tags: use-pretranspiled] - run with: x30.jar

Libraryless. Click here for Pure Java version (1862L/12K/42K).

!752

// Grammar table filled by the main program: the class name found on the right
// of "=" in a rule maps to the token list built from the rule's left side.
// A class name can have several token lists (one per rule).
static MultiMap<S, L<S>> productionMap = new MultiMap;

// Switch for the diagnostic trace output printed by the parsing functions.
static boolean debug = false;

// Main program: loads the grammar rules and the input text, registers every
// rule in productionMap, then prints each input line followed by whether it
// could be parsed as the start class "line".
p {
  S grammarSource = loadSnippet("#1002281");
  S sampleSource = loadSnippet("#1002286") + "\n" + loadSnippet("#1002280");
  S startClass = "line";
  
  // Phase 1: one rule per line, shaped "<pattern> = <class name>".
  for (S rule : toLinesFullTrim(grammarSource)) pcall {
    printF("Processing rule: *", rule);
    L<S> sides = splitAtJavaToken(rule, "=");
    if (l(sides) != 2) {
      print("Weird rule: " + rule);
      continue;
    }
    S pattern = sides.get(0), classPart = sides.get(1);
    
    // The right side must tokenize to 3 entries with an identifier at index 1.
    L<S> classTok = javaTok(classPart);
    assertEquals(structure(classTok), 3, l(classTok));
    S className = assertIdentifier(get(classTok, 1));
    
    // The left side becomes the production; <name> references are merged
    // into single tokens first.
    productionMap.put(className, mergeBracketThingies(javaTok(pattern)));
  }
  
  print(n(productionMap.size(), "production") + ".");
  print();
  
  // Phase 2: try to parse every input line from its first token.
  for (S line : toLinesFullTrim(sampleSource)) {
    print(line);
    Pos start = new Pos(javaTok(line));
    O result = parseClass(start, startClass);
    print(result != null ? "  parsed" : "  not parsed");
  }
}

// A cursor into a token list: the list itself plus the index of the current
// token. The index starts at 1 and the parser advances it in steps of 2
// (presumably because javaTok puts tokens at odd indices with whitespace in
// between - confirm against javaTok).
static class Pos {
  L<S> tok;
  int i = 1;
  
  *() {}
  *(L<S> *tok) {}
  *(L<S> *tok, int *i) {}
  
  // True once the index has reached the last list entry or moved past it.
  boolean end() { ret i >= l(tok)-1; }
  
  // Returns a new Pos sharing the same token list, at the same index.
  public Pos clone() { ret new Pos(tok, i); }
  
  // Two positions are equal when they refer to the very same token list
  // object (identity, not content) and hold the same index.
  public boolean equals(O o) {
    if (!(o instanceof Pos)) ret false;
    Pos pos = cast o;
    ret tok == pos.tok && i == pos.i;
  }
  
  // Kept consistent with equals(): derived from the identity of the token
  // list and the index, so equal positions always hash alike.
  public int hashCode() {
    ret 31 * System.identityHashCode(tok) + i;
  }
  
  // Joins all list entries from the current index onward into one string.
  S rest() {
    ret join(subList(tok, i));
  }
}

// Overwrites b with the state of a (index and token list reference);
// a itself is left untouched.
static void copy(Pos a, Pos b) {
  b.i = a.i;
  b.tok = a.tok;
}

// Prints a trace message together with the quoted remaining input at pos.
// Does nothing unless the debug flag is set.
static void debug(S bla, Pos pos) {
  if (!debug) return;
  S remaining = quote(pos.rest());
  print(bla + " on " + remaining);
}

// endless loop detector (state used by checkHalt)
// haltPos: copy of the position at which checkHalt last reset its bookkeeping.
// haltClasses: class names checkHalt has seen at haltPos since that reset.
static Pos haltPos;
static new HashSet<S> haltClasses;

// Tries to parse the class `name` at pos by attempting each of its
// productions in order on a private copy of the position.
// Returns the first non-null production result and moves pos behind the
// matched input; returns null (pos unchanged) if the loop detector fires,
// the class has no productions, or no production matches.
static O parseClass(Pos pos, S name) {
  if (debug) debug("parseClass " + name, pos);
  if (checkHalt(pos, name)) ret null;
  
  L<L<S>> alternatives = productionMap.get(name);
  if (empty(alternatives)) ret null; // weird, unknown class name
  
  for (L<S> alternative : alternatives) {
    Pos attempt = pos.clone();
    O result = parseProd(attempt, alternative);
    if (result == null) continue; // try the next alternative
    copy(attempt, pos); // commit the progress made by this alternative
    ret result;
  }
  
  ret null;
}

// Endless loop guard for parseClass.
// Remembers the last position it was called with and the class names
// requested there. Returns true (caller should halt) when className has
// already been requested at that same position, false otherwise.
// All trace output is printed only when the debug flag is set.
static boolean checkHalt(Pos pos, S className) {
  if (!eq(haltPos, pos)) {
    // Different position than last time: restart the bookkeeping here.
    haltPos = pos.clone();
    haltClasses = lithashset(className);
    ret false;
  }
  
  if (haltClasses.contains(className)) {
    if (debug)
      print("Endless loop: " + structure(pos) + " " + structure(haltClasses));
    ret true;
  }
  
  // Same position, but this class has not been requested here yet.
  haltClasses.add(className);
  if (debug)
    print("checkHalt: same pos, classes now: " + structure(haltClasses));
  ret false;
}

// Matches one production against the input starting at pos.
// Production entries are read at indices 1, 3, 5, ...; an entry of the form
// <name> is parsed recursively as class `name`, any other entry is a literal
// that must equal the current input token ignoring case ("*" matches any
// single token).
// Returns true when every entry matched, with pos moved behind the match.
// Returns null on failure; pos may then already have been advanced, so
// callers should pass a copy.
static O parseProd(Pos pos, L<S> prod) {
  if (debug)
    debug("parseProd " + structure(prod), pos);
  
  for (int i = 1; i < l(prod); i += 2) {
    S p = prod.get(i);
    if (isBracketedID(p)) {
      Pos _pos = pos.clone();
      O x = parseClass(_pos, unbracket(p));
      if (x == null) ret null;
      copy(_pos, pos);
      // keep parsing production
    } else {
      // it's a literal
      if (pos.end()) ret null; // need a token to match
      // Read the input token only after the end check, so running out of
      // input fails the match instead of raising an index error.
      S t = pos.tok.get(pos.i);
      if (!(eq(p, "*") || eqic(p, t)))
        ret null; // token mismatch
      pos.i += 2; // consume & keep parsing
    }
  }
  
  if (debug)
    debug("ok " + structure(prod), pos);
  ret true; // production succeeded
}

// True if s begins with "<" and ends with ">".
static boolean isBracketedID(S s) {
  if (!s.startsWith("<")) ret false;
  ret s.endsWith(">");
}

static S unbracket(S s) {
  ret isBracketedID(s) ? s.substring(1, l(s)-1) : s;
}

// Merges angle bracket things like <quoted> into single tokens.
// Works on a copy of the list: wherever the entries at i..i+4 are
// "<", "", identifier, "", ">", entry i becomes "<identifier>" and the four
// entries after it are dropped. The argument itself is not modified.
static L<S> mergeBracketThingies(L<S> tok) {
  L<S> merged = cloneList(tok);
  for (int i = 1; i+4 < l(merged); i += 2) {
    boolean isReference =
      eq(get(merged, i), "<")
      && eq(get(merged, i+1), "")
      && isIdentifier(get(merged, i+2))
      && eq(get(merged, i+3), "")
      && eq(get(merged, i+4), ">");
    if (!isReference) continue;
    
    merged.set(i, "<" + merged.get(i+2) + ">");
    // Drop the four entries following i; each removal shifts the next one
    // into slot i+1.
    for (int k = 0; k < 4; k++)
      merged.remove(i+1);
  }
  ret merged;
}

download  show line numbers  debug dex   

Travelled to 8 computer(s): cfunsshuasjs, gwrvuhgaqvyk, ishqpsrjomds, mqqgnosmbjvj, onxytkatvevr, teubizvjbppd, tslmcundralx, tvejysmllsmz

No comments. add comment

Snippet ID: #1002282
Snippet name: An NL Parser (developing)
Eternal ID of this version: #1002282/1
Text MD5: 4fd683174302221441cdb839c8e2412e
Transpilation MD5: 8886016cc6fb751a4b2a57ffd6937c5c
Author: stefan
Category: javax
Type: JavaX source code
Public (visible to everyone): Yes
Archived (hidden from active list): No
Created/modified: 2016-01-02 20:35:13
Source code size: 4074 bytes / 159 lines
Pitched / IR pitched: No / Yes
Views / Downloads: 280 / 271
Referenced in: [show]