From b85a6499d21beec2cb87830e63e6afef9569df1a Mon Sep 17 00:00:00 2001
From: Timotej Lazar <timotej.lazar@araneo.org>
Date: Sun, 8 Feb 2015 05:11:30 +0100
Subject: Simplify get_edits_from_traces

---
 monkey/edits.py | 76 ++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 45 insertions(+), 31 deletions(-)

(limited to 'monkey/edits.py')

diff --git a/monkey/edits.py b/monkey/edits.py
index a920bf5..480af5f 100644
--- a/monkey/edits.py
+++ b/monkey/edits.py
@@ -1,11 +1,12 @@
 #!/usr/bin/python3
 
 import collections
+import math
 
 from .action import expand, parse
 from .graph import Node
 from prolog.util import rename_vars, stringify, tokenize
-from .util import get_line
+from .util import get_line, avg, logistic
 
 # A line edit is a contiguous sequences of actions within a single line. This
 # function takes a sequence of actions and builds a directed acyclic graph
@@ -146,25 +147,31 @@ def get_paths(root, path=None, done=None):
 # edits. Return a dictionary of edits and their frequencies, and also
 # submissions and queries in [traces].
 def get_edits_from_traces(traces):
-    # Helper function to remove trailing punctuation from lines.
-    def remove_punct(line):
+    # Helper function to remove trailing punctuation from lines and rename
+    # variables to A1,A2,A3,… (potentially using [var_names]). Return a tuple.
+    def normalize(line, var_names=None):
+        # Remove trailing punctuation.
         i = len(line)
         while i > 0:
             if line[i-1].type not in ('COMMA', 'PERIOD', 'SEMI'):
                 break
             i -= 1
-        return line[:i]
+        return tuple(rename_vars(line[:i], var_names))
 
     # Return values: counts for observed edits, lines, submissions and queries.
     edits = collections.Counter()
-    lines = collections.Counter()
     submissions = collections.Counter()
     queries = collections.Counter()
 
+    # Counts of traces where each line appears as a leaf / any node.
+    n_leaf = collections.Counter()
+    n_all = collections.Counter()
+
     for trace in traces:
         try:
             actions = parse(trace)
         except:
+            # Only a few traces fail to parse, so just ignore them.
             continue
         nodes, trace_submissions, trace_queries = edit_graph(actions)
 
@@ -178,41 +185,48 @@ def get_edits_from_traces(traces):
             queries[code] += 1
 
         # Get edits.
-        seen_edits = set()
+        trace_edits = set()
         for path in get_paths(nodes[0]):
-            for i in range(len(path)):
+            for i in range(1, len(path)):
+                # Normalize path[i-1] → path[i] into start → end. Reuse
+                # variable names from start when normalizing end.
                 var_names = {}
-                start = tuple(rename_vars(remove_punct(path[i]), var_names))
-                for j in range(len(path[i+1:])):
-                    var_names_copy = {k: v for k, v in var_names.items()}
-                    end = tuple(rename_vars(remove_punct(path[i+1+j]), var_names_copy))
-                    if start == end:
-                        continue
+                start = normalize(path[i-1], var_names)
+                end = normalize(path[i], var_names)
 
+                # This should always succeed but check anyway.
+                if start != end:
                     edit = (start, end)
-                    if edit not in seen_edits:
-                        seen_edits.add(edit)
-                        edits[edit] += 1
-                        lines[start] += 1
+                    trace_edits.add(edit)
+        edits.update(trace_edits)
+
+        # Update node counts.
+        n_leaf.update(set([normalize(n.data[2]) for n in nodes if n.data[2] and not n.eout]))
+        n_all.update(set([normalize(n.data[2]) for n in nodes if n.data[2]]))
 
-    # Discard rarely occurring edits. XXX only for testing
+    # Discard edits that only occur in one trace.
     singletons = [edit for edit in edits if edits[edit] < 2]
     for edit in singletons:
-        lines[edit[0]] -= edits[edit]
         del edits[edit]
 
-    # Get the probability of each edit given its "before" or "after" part.
-    max_insert_count = max([count for (before, after), count in edits.items() if not before])
-    for before, after in edits:
-        if before:
-            edits[(before, after)] /= max(lines[before], 1)
-        else:
-            edits[(before, after)] /= max_insert_count
-
-    # Normalize line frequencies.
-    if len(lines) > 0:
-        lines_max = max(max(lines.values()), 1)
-        lines = {line: count/lines_max for line, count in lines.items()}
+    # Find the probability of each edit a → b.
+    for (a, b), count in edits.items():
+        p = 1.0
+        if a:
+            p *= 1 - (n_leaf[a] / (n_all[a]+1))
+        if b:
+            b_normal = normalize(b)
+            p *= n_leaf[b_normal] / (n_all[b_normal]+1)
+        if a and b:
+            p = math.sqrt(p)
+        edits[(a, b)] = p
+
+    # Tweak the edit distribution to improve search.
+    avg_p = avg(edits.values())
+    for edit, p in edits.items():
+        edits[edit] = logistic(p, k=3, x_0=avg_p)
+
+    lines = dict(n_leaf)
 
     return edits, lines, submissions, queries
 
-- 
cgit v1.2.1