#!/usr/bin/python3

import collections
import math

from .action import expand, parse
from .graph import Node
from prolog.util import annotate, normalized, rename_vars, stringify, tokenize
from .util import get_line, avg, logistic


def trace_graph(trace):
    """Parse the sequence of actions in [trace] and return a directed acyclic
    graph representing development history.

    Each node represents a particular version of some line, and each edge
    represents a "line edit" (contiguous sequence of inserts/removes within a
    single line).

    Return a list of nodes with root as the first element. Also return sets of
    submissions (user-tested program versions) and queries in this attempt.
    """
    # Return values.
    nodes = [Node([0, 0, ()])]   # Node data: rank (Y), line no. (X), and tokens.
    submissions = set()          # Program versions at 'test' actions.
    queries = set()              # Queries run by the student.

    # State variables.
    leaves = {0: nodes[0]}       # Current leaf node for each line.
    rank = 1                     # Rank (order / y-position) for the next node.
    code_next = ''               # Program code after applying the current action.
    done = False                 # Set to True on first correct version.

    # Parse trace actions and ensure there is a separate action for each
    # inserted/removed character.
    try:
        actions = parse(trace)
        expand(actions)
    except Exception:
        # Only a few traces fail to parse, so just skip them. Catch Exception
        # (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
        actions = []

    for action_id, action in enumerate(actions):
        code = code_next
        code_next = action.apply(code)

        if action.type == 'test':
            submissions.add((code, action.total == action.passed))
            if action.total == action.passed:
                done = True
        elif action.type in ('solve', 'solve_all'):
            queries.add(action.query)
        elif action.type in ('insert', 'remove'):
            # Ignore edits after the first correct version.
            if done:
                continue

            # Number of the changed line.
            line = code[:action.offset].count('\n')
            # Current leaf node for this line.
            parent = leaves[line]
            # Tokens in this line after applying [action].
            tokens_next = tuple(tokenize(get_line(code_next, line)))

            # If a new node is inserted, clone each leaf into the next rank.
            # This makes it easier to print the graph for graphviz; when
            # analyzing the graph, duplicate nodes without siblings should be
            # ignored.
            new_leaves = {}
            if action.text == '\n':
                if action.type == 'insert':
                    # A line was split: the parent line gets two children.
                    tokens_next_right = tuple(tokenize(get_line(code_next, line+1)))

                    child_left = Node([rank, line, tokens_next])
                    parent.add_out(child_left)

                    child_right = Node([rank, line+1, tokens_next_right])
                    parent.add_out(child_right)

                    # Create new leaf nodes; lines below the split shift down.
                    for i, leaf in leaves.items():
                        if i < line:
                            new_leaves[i] = Node([rank, i, leaf.data[2]])
                            leaf.add_out(new_leaves[i])
                        elif i > line:
                            new_leaves[i+1] = Node([rank, i+1, leaf.data[2]])
                            leaf.add_out(new_leaves[i+1])
                    new_leaves[line] = child_left
                    new_leaves[line+1] = child_right
                elif action.type == 'remove':
                    # Two lines were joined: both parents share one child.
                    parent_right = leaves[line+1]

                    child = Node([rank, line, tokens_next])
                    parent_right.add_out(child)
                    parent.add_out(child)

                    # Create new leaf nodes; lines below the join shift up.
                    for i, leaf in leaves.items():
                        if i < line:
                            new_leaves[i] = Node([rank, i, leaf.data[2]])
                            leaf.add_out(new_leaves[i])
                        elif i > line+1:
                            new_leaves[i-1] = Node([rank, i-1, leaf.data[2]])
                            leaf.add_out(new_leaves[i-1])
                    new_leaves[line] = child
            else:
                # Skip the node if the next action is insert/remove (except \n)
                # on the same line.
                if action_id < len(actions)-1:
                    action_next = actions[action_id+1]
                    if action_next.type in ('insert', 'remove'):
                        line_next = code_next[:action_next.offset].count('\n')
                        if action_next.text != '\n' and line == line_next:
                            continue
                # Skip the node if it is the same as the parent.
                if tokens_next == parent.data[2]:
                    continue

                child = Node([rank, line, tokens_next])
                parent.add_out(child)

                # Create new leaf nodes.
                for i, leaf in leaves.items():
                    if i != line:
                        new_leaves[i] = Node([rank, i, leaf.data[2]])
                        leaf.add_out(new_leaves[i])
                new_leaves[line] = child

            leaves = new_leaves
            nodes += leaves.values()
            rank += 1

    return nodes, submissions, queries


def graph_edits(nodes):
    """Return a set of edits (start, end) token-tuple pairs that appear in the
    trace_graph given by [nodes]."""
    edits = set()
    for a in nodes:
        a_data = a.data[2]
        for b in a.eout:
            b_data = b.data[2]

            # Normalize a → b into start → end. Reuse variable names from a
            # when normalizing b.
            var_names = {}
            start = normalized(a_data, var_names)
            end = normalized(b_data, var_names)
            if start == end:
                continue

            # An edit start → ε happens each time the user inserts \n; ignore
            # such cases.
            if not end and len(a.eout) > 1:
                continue

            # Skip edits where start/end is composed of more than one part.
            # TODO improve trace_graph to handle this instead.
            if [t for t in annotate(stringify(start)) if t.part > 0]:
                continue
            if [t for t in annotate(stringify(end)) if t.part > 0]:
                continue

            edits.add((start, end))
    return edits


def get_edits_from_traces(traces):
    """Build an edit graph for each trace and find "meaningful" (to be
    defined) edits.

    Return a dictionary of edits and their frequencies, and also submissions,
    queries and variable names observed in [traces].
    """
    # Return values: counts for observed edits, submissions, queries and names.
    edits = collections.Counter()
    submissions = collections.Counter()
    queries = collections.Counter()
    names = collections.Counter()

    # Counts of traces where each line appears as a leaf / any node.
    n_leaf = collections.Counter()
    n_all = collections.Counter()

    for trace in traces:
        nodes, trace_submissions, trace_queries = trace_graph(trace)
        # Names already counted for this trace; a set gives O(1) membership
        # tests (only membership is ever observed, so order does not matter).
        counted_tokens = set()

        # Update the submissions/queries counters (use normalized variables).
        for (submission, correct) in trace_submissions:
            if correct:
                tokens = list(tokenize(submission))
                for token in tokens:
                    if token.type == 'NAME' and token.val not in counted_tokens:
                        names[token.val] += 1
                        counted_tokens.add(token.val)
                code = stringify(rename_vars(tokens))
                submissions[code] += 1
        for query in trace_queries:
            code = stringify(rename_vars(tokenize(query)))
            queries[code] += 1

        # Update the edit and leaf/node counters (at most once per trace).
        edits.update(graph_edits(nodes))
        n_leaf.update({normalized(n.data[2]) for n in nodes if n.data[2] and not n.eout})
        n_all.update({normalized(n.data[2]) for n in nodes if n.data[2]})

    # Discard edits that only occur in one trace.
    singletons = [edit for edit in edits if edits[edit] < 2]
    for edit in singletons:
        del edits[edit]

    # Find the probability of each edit a → b. (Rebinding values while
    # iterating items() is safe: no keys are added or removed.)
    for (a, b), count in edits.items():
        p = 1.0
        if a:
            p *= 1 - (n_leaf[a] / (n_all[a]+1))
        if b:
            b_normal = normalized(b)
            p *= n_leaf[b_normal] / (n_all[b_normal]+1)
        if a and b:
            p = math.sqrt(p)
        edits[(a, b)] = p

    # Tweak the edit distribution to improve search. Guard against an empty
    # counter, where avg() would have no values to average.
    if edits:
        avg_p = avg(edits.values())
        for edit, p in edits.items():
            edits[edit] = logistic(p, k=3, x_0=avg_p)

    return edits, submissions, queries, names


def classify_edits(edits):
    """Split [edits] (a {(before, after): cost} mapping) into three dicts:
    inserts (empty before), removes (empty after) and changes (both)."""
    inserts = {}
    removes = {}
    changes = {}
    for (before, after), cost in edits.items():
        if after and not before:
            inserts[after] = cost
        elif before and not after:
            removes[before] = cost
        else:
            changes[(before, after)] = cost
    return inserts, removes, changes


# Extract edits and other data from existing traces for each problem.
if __name__ == '__main__':
    import pickle
    from db.models import Problem, Solution

    # Ignore traces from these users.
    ignored_users = [
        1,    # admin
        231,  # test
        360,  # test2
        358,  # sasha
    ]

    edits, submissions, queries, names = {}, {}, {}, {}
    for problem in Problem.list():
        pid = problem.id
        solutions = Solution.filter(problem_id=pid, done=True)
        traces = [s.trace for s in solutions if s.codeq_user_id not in ignored_users]
        if not traces:
            print('No traces for {}'.format(problem.name))
            continue

        print('Analyzing traces for {}… '.format(problem.name), end='', flush=True)
        try:
            edits[pid], submissions[pid], queries[pid], names[pid] = get_edits_from_traces(traces)
            print('{} edits, {} submissions, {} queries, {} names'.format(
                len(edits[pid]), len(submissions[pid]), len(queries[pid]), len(names[pid])))
        except Exception as ex:
            print('error: {}'.format(ex))

    # Use a context manager so the pickle file is flushed and closed even if
    # dump() raises (the original leaked the file handle).
    with open('edits.pickle', 'wb') as f:
        pickle.dump((edits, submissions, queries, names), f)