From b85a6499d21beec2cb87830e63e6afef9569df1a Mon Sep 17 00:00:00 2001 From: Timotej Lazar Date: Sun, 8 Feb 2015 05:11:30 +0100 Subject: Simplify get_edits_from_traces --- monkey/edits.py | 76 ++++++++++++++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 31 deletions(-) (limited to 'monkey/edits.py') diff --git a/monkey/edits.py b/monkey/edits.py index a920bf5..480af5f 100644 --- a/monkey/edits.py +++ b/monkey/edits.py @@ -1,11 +1,12 @@ #!/usr/bin/python3 import collections +import math from .action import expand, parse from .graph import Node from prolog.util import rename_vars, stringify, tokenize -from .util import get_line +from .util import get_line, avg, logistic # A line edit is a contiguous sequences of actions within a single line. This # function takes a sequence of actions and builds a directed acyclic graph @@ -146,25 +147,31 @@ def get_paths(root, path=None, done=None): # edits. Return a dictionary of edits and their frequencies, and also # submissions and queries in [traces]. def get_edits_from_traces(traces): - # Helper function to remove trailing punctuation from lines. - def remove_punct(line): + # Helper function to remove trailing punctuation from lines and rename + # variables to A1,A2,A3,… (potentially using [var_names]). Return a tuple. + def normalize(line, var_names=None): + # Remove trailing punctuation. i = len(line) while i > 0: if line[i-1].type not in ('COMMA', 'PERIOD', 'SEMI'): break i -= 1 - return line[:i] + return tuple(rename_vars(line[:i], var_names)) # Return values: counts for observed edits, lines, submissions and queries. edits = collections.Counter() - lines = collections.Counter() submissions = collections.Counter() queries = collections.Counter() + # Counts of traces where each line appears as a leaf / any node. + n_leaf = collections.Counter() + n_all = collections.Counter() + for trace in traces: try: actions = parse(trace) except: + # Only a few traces fail to parse, so just ignore them. continue nodes, trace_submissions, trace_queries = edit_graph(actions) @@ -178,41 +185,48 @@ def get_edits_from_traces(traces): queries[code] += 1 # Get edits. - seen_edits = set() + trace_edits = set() for path in get_paths(nodes[0]): - for i in range(len(path)): + for i in range(1, len(path)): + # Normalize path[i-1] → path[i] into start → end. Reuse + # variable names from start when normalizing end. var_names = {} - start = tuple(rename_vars(remove_punct(path[i]), var_names)) - for j in range(len(path[i+1:])): - var_names_copy = {k: v for k, v in var_names.items()} - end = tuple(rename_vars(remove_punct(path[i+1+j]), var_names_copy)) - if start == end: - continue + start = normalize(path[i-1], var_names) + end = normalize(path[i], var_names) + # This should always succeed but check anyway. + if start != end: edit = (start, end) - if edit not in seen_edits: - seen_edits.add(edit) - edits[edit] += 1 - lines[start] += 1 + trace_edits.add(edit) + edits.update(trace_edits) + + # Update node counts. + n_leaf.update(set([normalize(n.data[2]) for n in nodes if n.data[2] and not n.eout])) + n_all.update(set([normalize(n.data[2]) for n in nodes if n.data[2]])) - # Discard rarely occurring edits. XXX only for testing + # Discard edits that only occur in one trace. singletons = [edit for edit in edits if edits[edit] < 2] for edit in singletons: - lines[edit[0]] -= edits[edit] del edits[edit] - # Get the probability of each edit given its "before" or "after" part. - max_insert_count = max([count for (before, after), count in edits.items() if not before]) - for before, after in edits: - if before: - edits[(before, after)] /= max(lines[before], 1) - else: - edits[(before, after)] /= max_insert_count - - # Normalize line frequencies. - if len(lines) > 0: - lines_max = max(max(lines.values()), 1) - lines = {line: count/lines_max for line, count in lines.items()} + # Find the probability of each edit a → b. + for (a, b), count in edits.items(): + p = 1.0 + if a: + p *= 1 - (n_leaf[a] / (n_all[a]+1)) + if b: + b_normal = normalize(b) + p *= n_leaf[b_normal] / (n_all[b_normal]+1) + if a and b: + p = math.sqrt(p) + edits[(a, b)] = p + + # Tweak the edit distribution to improve search. + avg_p = avg(edits.values()) + for edit, p in edits.items(): + edits[edit] = logistic(p, k=3, x_0=avg_p) + + lines = dict(n_leaf) return edits, lines, submissions, queries -- cgit v1.2.1